library(knitr)
knitr::opts_chunk$set(cache = TRUE, warning = FALSE,
message = FALSE, cache.lazy = FALSE, cache.path = "../../gitignore/RmdCache/", # keep heavy data in gitignore cache
fig.path = "Rfigures/Rmd/")
machine="mythinkpad" # define the machine we work on
loadALL = TRUE # load all uniteCov objects
loadannot = TRUE # load genome annotations
sourceDMS = TRUE # Load the calculated DMS
sourceSubUnite = FALSE
source("R02.3_DATALOAD.R")
Data getting loaded:
Compare fitness traits between the different offsprings groups. Follow up of Sagonas 2020 & Ferre Ortega’s master’s dissertation
Kaufmann et al. 2014: Body condition of the G2 fish, an estimate of fish health and a predictor of energy reserves and reproductive success, was calculated using there residuals from the regression of body mass on body length (Chellappaet al.1995).
fullMetadata_OFFS$BCI <- residuals(lmer(Wnettofin ~ Slfin * Sex + (1|brotherPairID), data=fullMetadata_OFFS))
## and for parents (no sex difference, only males):
fullMetadata_PAR$BCI <- residuals(lmer(Wnettofin ~ Slfin + (1|brotherPairID), data=fullMetadata_PAR))
Effect of paternal treatment on body condition of offspring: Kaufmann et al. 2014: “To investigate in which way paternal G1 exposure affected offspring tolerance, we tested how the relationship between G2 body condition and infection intensity was affected by paternal G1 exposure. This was tested in a linear mixed model on G2 body condition with paternal G1 treatment and the interaction between paternal G1 treatment and G2 infection intensity as fixed effects. Maternal half-sibship identity was set as a random effect”
Effect of treatment groups of offspring on body condition(Kaufmann et al. 2014): “The linear mixed effect model (nlme function in R) included G2 body condition as dependent variable, sex, G2 treatment (exposed vs. control), paternal G1 treatment (exposed vs. control) and their interactions as fixed effects as well as maternal G2 half-sibship identity as a random effect”
mod1 <- lme(BCI ~ offsTrt * patTrt, random=~1|brotherPairID,data=fullMetadata_OFFS)
anova(mod1) # strong significant effect of both offspring trt & paternal + interactions
## numDF denDF F-value p-value
## (Intercept) 1 100 0.000000 1.0000
## offsTrt 1 100 14.057746 0.0003
## patTrt 1 100 13.600342 0.0004
## offsTrt:patTrt 1 100 9.396573 0.0028
mod1.2 <- lme(BCI ~ trtG1G2, random=~1|brotherPairID,data=fullMetadata_OFFS)
## pairwise posthoc test
emmeans(mod1.2, list(pairwise ~ trtG1G2), adjust = "tukey")
## $`emmeans of trtG1G2`
## trtG1G2 emmean SE df lower.CL upper.CL
## NE_control 16.6 10.8 7 -8.87 42.0
## NE_exposed -57.7 11.0 7 -83.63 -31.8
## E_control 23.6 10.8 7 -1.85 49.0
## E_exposed 15.5 10.8 7 -9.90 41.0
##
## Degrees-of-freedom method: containment
## Confidence level used: 0.95
##
## $`pairwise differences of trtG1G2`
## 1 estimate SE df t.ratio p.value
## NE_control - NE_exposed 74.29 15.3 100 4.840 <.0001
## NE_control - E_control -7.03 15.2 100 -0.462 0.9671
## NE_control - E_exposed 1.03 15.2 100 0.067 0.9999
## NE_exposed - E_control -81.32 15.3 100 -5.298 <.0001
## NE_exposed - E_exposed -73.27 15.3 100 -4.773 <.0001
## E_control - E_exposed 8.05 15.2 100 0.529 0.9517
##
## Degrees-of-freedom method: containment
## P value adjustment: tukey method for comparing a family of 4 estimates
## Control father - treatment offspring has a strongly significantly lower BC than
## every other group, same as Kaufmann et al. 2014
myplot1 <- ggplot(fullMetadata_OFFS, aes(x=trtG1G2, y = BCI, fill=trtG1G2))+
geom_boxplot()+
geom_signif(comparisons = list(c("NE_control", "NE_exposed")),
map_signif_level=TRUE, annotations="***",
y_position = 150, tip_length = 0, vjust=0.4) +
geom_signif(comparisons = list(c("NE_exposed", "E_control")),
map_signif_level=TRUE, annotations="***",
y_position = 200, tip_length = 0, vjust=0.4) +
geom_signif(comparisons = list(c("NE_exposed", "E_exposed")),
map_signif_level=TRUE, annotations="***",
y_position = 250, tip_length = 0, vjust=0.4) +
scale_fill_manual(values = colOffs)+
theme_bw() + theme(legend.position = "none") +
ylab("Body Condition Index") +
scale_x_discrete(labels=c("NE_control" = "G1 control\nG2 control", "NE_exposed" = "G1 control\nG2 infected",
"E_control" = "G1 infected\nG2 control", "E_exposed" = "G1 infected\nG2 infected"),
name = NULL)
myplot1
mod_Tol <- lmer(BCI ~ No.Worms*PAT + (1|brotherPairID)+ (1|Sex), data=fullMetadata_OFFS, REML = F)
## Model selection:
step(mod_Tol, reduce.random = F) # Model found: full model
## Backward reduced random-effect table:
##
## Eliminated npar logLik AIC LRT Df Pr(>Chisq)
## <none> 7 -608.33 1230.7
## (1 | brotherPairID) 0 6 -608.33 1228.7 0 1 1
## (1 | Sex) 0 6 -608.33 1228.7 0 1 1
##
## Backward reduced fixed-effect table:
## Degrees of freedom method: Satterthwaite
##
## Eliminated Sum Sq Mean Sq NumDF DenDF F value Pr(>F)
## No.Worms:PAT 0 20660 20660 1 111 6.1283 0.01481 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Model found:
## BCI ~ No.Worms * PAT + (1 | brotherPairID) + (1 | Sex)
## The slope of BCI on nbrworms varies upon treatment
plot(ggpredict(mod_Tol, terms = c("No.Worms", "PAT")), add.data=T)+ theme_bw() +
ylab("Body Condition Index") + xlab("Number of worms") +
ggtitle("Predicted values of Body Condition Index in offspring")+
scale_color_manual(NULL, values = c("black", "red")) +
scale_fill_manual(NULL, values = c("black", "red")) +
scale_x_continuous(breaks = 0:10)+
geom_point(size=0)+ # to have color key in legend as point
guides(colour = guide_legend(override.aes = list(size=3,linetype=0, fill = NA)))
Calculate number of methylated sites, mean coverage, and residuals of methylated sites by covered sites (to account for coverage bias)
mycalcRMS <- function(myUniteCov, myMetaData){
percMethMat = methylKit::percMethylation(myUniteCov)
# create a dataframe with all info
percMethDF = data.frame(SampleID = colnames(percMethMat),
Nbr_methCpG = colSums(percMethMat>=70 & !is.na(percMethMat)), ## number of methylated sites
Nbr_coveredCpG = colSums(!is.na(percMethMat)), ## number of sites covered in this sample
Nbr_NOTcoveredCpG = colSums(is.na(percMethMat)),## number of sites NOT covered in this sample
MeanCoverage = colMeans(methylKit::getData(myUniteCov)[,myUniteCov@coverage.index], na.rm = T), ## coverage.index: vector denoting which columns in the data correspond to coverage values
OverallPercentageMethylation = colMeans(methylKit::percMethylation(myUniteCov), na.rm = T))
## RMS in this sample based on covered sites
percMethDF$RMS_coveredCpG = percMethDF$Nbr_methCpG / percMethDF$Nbr_coveredCpG
## merge with original metadata:
myMetaData = merge(myMetaData, percMethDF)
# calculate also RMS global, considering CpG covered or not (to compare)
myMetaData$RMS_allCpG_coveredOrNot = myMetaData$Nbr_methCpG / (myMetaData$M.Seqs_rawReads*10e6)
# calculate residuals of nbr of methCpG by nbr of covered CpG
myMetaData$res_Nbr_methCpG_Nbr_coveredCpG = residuals(
lm(myMetaData$Nbr_methCpG ~ myMetaData$Nbr_coveredCpG))
## REORDER myMetaData by sample ID
myMetaData = myMetaData[order(as.numeric(gsub("S", "", myMetaData$SampleID))),]
return(myMetaData)
}
fullMetadata <- mycalcRMS(uniteCovALL_woSexAndUnknowChr, fullMetadata)
fullMetadata_PAR <- mycalcRMS(uniteCov6_G1_woSexAndUnknowChrOVERLAP, fullMetadata_PAR)
fullMetadata_OFFS <- mycalcRMS(uniteCov14_G2_woSexAndUnknowChrOVERLAP, fullMetadata_OFFS)
We kept in total 135 samples. On average, 11.27 [+/- 0.33] million reads were sequenced. The average mapping efficiency was 85% [+/- 0.48%].
The mean coverage per CpG site in the full dataset, considering positions covered in all fish, is 83 [+/- 2.21].
For G1 only: We kept in total 24 samples. On average, 11.16 [+/- 0.67] million reads were sequenced. The average mapping efficiency was 84% [+/- 1.39%].
The mean coverage per CpG site in G1, considering positions shared by at least 6 fish per treatment group (half individuals per group), and which overlap with positions retained in G2, is 46 [+/- 1.71].
For G2 only: We kept in total 111 samples. On average, 11.29 [+/- 0.38] million reads were sequenced. The average mapping efficiency was 86% [+/- 0.47%].
The mean coverage per CpG site in G2, considering positions shared by at least 14 fish per treatment group (half individuals per group), and which overlap with positions retained in G1, is 47 [+/- 0.76].
cor.test(fullMetadata_PAR$Nbr_coveredCpG,
fullMetadata_PAR$Nbr_methCpG, method = "spearman")
##
## Spearman's rank correlation rho
##
## data: fullMetadata_PAR$Nbr_coveredCpG and fullMetadata_PAR$Nbr_methCpG
## S = 350, p-value = 2.153e-06
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.8478261
## S = 350, p-value = 2.15e-06, rho = 0.85
ggplot(fullMetadata_PAR, aes(x=Nbr_coveredCpG, y=Nbr_methCpG))+
geom_smooth(method = "lm", col="black")+
geom_point(aes(col=trtG1G2), size = 3)+ scale_color_manual(values = c("grey", "red")) +
theme_bw() + ggtitle(label = "Parents, CpG shared by half fish/trt")
## Check after RMS correction for coverage bias: CORRECTED (p-value = 0.4485)
cor.test(fullMetadata_PAR$Nbr_coveredCpG,
fullMetadata_PAR$RMS_coveredCpG, method = "spearman")
##
## Spearman's rank correlation rho
##
## data: fullMetadata_PAR$Nbr_coveredCpG and fullMetadata_PAR$RMS_coveredCpG
## S = 2712, p-value = 0.4006
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.1791304
ggplot(fullMetadata_PAR, aes(x=Nbr_coveredCpG, y=RMS_coveredCpG))+
geom_smooth(method = "lm", col="black")+
geom_point(aes(col=trtG1G2), size = 3)+ scale_color_manual(values = c("grey", "red")) +
theme_bw() + ggtitle(label = "Parents, CpG shared by half fish/trt")
## and with residuals: COMPLETELY CORRECTED p-value = 0.9562
cor.test(fullMetadata_PAR$Nbr_coveredCpG,
fullMetadata_PAR$res_Nbr_methCpG_Nbr_coveredCpG, method = "spearman")
##
## Spearman's rank correlation rho
##
## data: fullMetadata_PAR$Nbr_coveredCpG and fullMetadata_PAR$res_Nbr_methCpG_Nbr_coveredCpG
## S = 2308, p-value = 0.9886
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.003478261
ggplot(fullMetadata_PAR, aes(x=Nbr_coveredCpG, y=res_Nbr_methCpG_Nbr_coveredCpG))+
geom_smooth(method = "lm", col="black")+
geom_point(aes(col=trtG1G2), size = 3)+ scale_color_manual(values = c("grey", "red")) +
theme_bw() + ggtitle(label = "Parents, CpG shared by half fish/trt")
############
## Offspring:
cor.test(fullMetadata_OFFS$Nbr_coveredCpG,
fullMetadata_OFFS$Nbr_methCpG, method = "spearman")
##
## Spearman's rank correlation rho
##
## data: fullMetadata_OFFS$Nbr_coveredCpG and fullMetadata_OFFS$Nbr_methCpG
## S = 20254, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.9111355
## S = 20254, p-value < 2.2e-16 rho = 0.91
ggplot(fullMetadata_OFFS, aes(x=Nbr_coveredCpG, y=Nbr_methCpG))+
geom_smooth(method = "lm", col="black")+
geom_point(aes(col=trtG1G2), size = 3)+ scale_color_manual(values = colOffs) +
scale_x_continuous("Number of cytosines covered") +
scale_y_continuous("Number of methylated cytosines") +
theme_bw() + ggtitle(label = "Offspring, CpG shared by half fish/trt")
## Plot distance to residuals:
fit <- lm(Nbr_methCpG ~ Nbr_coveredCpG, data = fullMetadata_OFFS)
plotdf <- fullMetadata_OFFS
plotdf$predicted <- predict(fit) # Save the predicted values
plotdf$residuals <- residuals(fit)
ggplot(plotdf, aes(x=Nbr_coveredCpG, y=Nbr_methCpG))+
geom_smooth(method = "lm", col="black")+
geom_segment(aes(xend = Nbr_coveredCpG, yend = predicted), col = "grey") +
geom_point(aes(col=trtG1G2), size = 3)+ scale_color_manual(values = colOffs) +
scale_x_continuous("Number of cytosines covered") +
scale_y_continuous("Number of methylated cytosines") +
theme_bw() + ggtitle(label = "Offspring, CpG shared by half fish/trt")
## Check after RMS correction for coverage bias: SEMI CORRECTED (p-value = 0.01, rho = -0.24)
cor.test(fullMetadata_OFFS$Nbr_coveredCpG,
fullMetadata_OFFS$RMS_coveredCpG, method = "spearman")
##
## Spearman's rank correlation rho
##
## data: fullMetadata_OFFS$Nbr_coveredCpG and fullMetadata_OFFS$RMS_coveredCpG
## S = 282466, p-value = 0.01158
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## -0.2393208
ggplot(fullMetadata_OFFS, aes(x=Nbr_coveredCpG, y=RMS_coveredCpG))+
geom_point(aes(col=trtG1G2), size = 3)+ scale_color_manual(values = colOffs) +
geom_smooth(method = "lm", col="black")+
theme_bw() + ggtitle(label = "Offspring, CpG shared by half fish/trt")
## and with residuals: COMPLETELY CORRECTED p-value = 0.51
cor.test(fullMetadata_OFFS$Nbr_coveredCpG,
fullMetadata_OFFS$res_Nbr_methCpG_Nbr_coveredCpG, method = "spearman")
##
## Spearman's rank correlation rho
##
## data: fullMetadata_OFFS$Nbr_coveredCpG and fullMetadata_OFFS$res_Nbr_methCpG_Nbr_coveredCpG
## S = 213674, p-value = 0.514
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.06250439
ggplot(fullMetadata_OFFS, aes(x=Nbr_coveredCpG, y=res_Nbr_methCpG_Nbr_coveredCpG))+
geom_point(aes(col=trtG1G2), size = 3)+ scale_color_manual(values = colOffs) +
geom_smooth(method = "lm", col="black")+
scale_x_continuous("Number of cytosines covered") +
scale_y_continuous("Residuals of number of methylated cytosines\n on number of cytosines covered") +
theme_bw() + ggtitle(label = "Offspring, CpG shared by half fish/trt")
mod = lm(MappingEfficiency.BSBoldvsGynogen ~ Sex, data = fullMetadata_OFFS)
summary(step(mod))
## Start: AIC=203.17
## MappingEfficiency.BSBoldvsGynogen ~ Sex
##
## Df Sum of Sq RSS AIC
## <none> 667.74 203.18
## - Sex 1 22.563 690.30 204.86
##
## Call:
## lm(formula = MappingEfficiency.BSBoldvsGynogen ~ Sex, data = fullMetadata_OFFS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9847 -1.5030 0.3133 2.0418 4.0823
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 86.3134 0.3337 258.624 <2e-16 ***
## SexM -0.9017 0.4699 -1.919 0.0576 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.475 on 109 degrees of freedom
## Multiple R-squared: 0.03269, Adjusted R-squared: 0.02381
## F-statistic: 3.683 on 1 and 109 DF, p-value: 0.05758
plot(ggpredict(mod, terms = c("Sex")), add.data = T)
NB: this is WITH unknown and sex chromosomes, before filtering.
mod = lm(M.Seqs_rawReads ~ Sex, data = fullMetadata_OFFS)
summary(step(mod))
## Start: AIC=158.08
## M.Seqs_rawReads ~ Sex
##
## Df Sum of Sq RSS AIC
## - Sex 1 3.8569 448.66 157.04
## <none> 444.80 158.08
##
## Step: AIC=157.04
## M.Seqs_rawReads ~ 1
##
## Call:
## lm(formula = M.Seqs_rawReads ~ 1, data = fullMetadata_OFFS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.4901 -1.1901 -0.2901 0.8599 8.8099
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.2901 0.1917 58.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.02 on 110 degrees of freedom
plot(ggpredict(mod, terms = c("Sex")), add.data = T)
NB: this is WITH unknown and sex chromosomes, before filtering.
mod = lm(MeanCoverage ~ Sex, data = fullMetadata_OFFS)
summary(step(mod))
## Start: AIC=314.87
## MeanCoverage ~ Sex
##
## Df Sum of Sq RSS AIC
## - Sex 1 4.4425 1831.0 313.14
## <none> 1826.5 314.87
##
## Step: AIC=313.14
## MeanCoverage ~ 1
##
## Call:
## lm(formula = MeanCoverage ~ 1, data = fullMetadata_OFFS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.8662 -2.4123 -0.6565 1.7138 15.0594
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 46.8858 0.3872 121.1 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.08 on 110 degrees of freedom
plot(ggpredict(mod, terms = c("Sex")), add.data = T)
NB: this is in G2, considering positions shared by at least 14 fish per treatment group (half individuals per group), and which overlap with positions retained in G1, without sex and unknown chromosome (after filtering)
mod = lm(Nbr_coveredCpG ~ Sex, data = fullMetadata_OFFS)
summary(step(mod))
## Start: AIC=2579.96
## Nbr_coveredCpG ~ Sex
##
## Df Sum of Sq RSS AIC
## - Sex 1 1.9317e+10 1.3495e+12 2579.6
## <none> 1.3302e+12 2580.0
##
## Step: AIC=2579.56
## Nbr_coveredCpG ~ 1
##
## Call:
## lm(formula = Nbr_coveredCpG ~ 1, data = fullMetadata_OFFS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -338266 -58536 26350 82661 139983
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 853897 10513 81.22 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 110800 on 110 degrees of freedom
plot(ggpredict(mod, terms = c("Sex")), add.data = T)
NB: this
is in G2, considering positions shared by at least 14 fish per treatment
group (half individuals per group), and which overlap with positions
retained in G1, without sex and unknown chromosome (after filtering)
mod = lm(OverallPercentageMethylation ~ Sex, data = fullMetadata_OFFS)
summary(step(mod))
## Start: AIC=154.63
## OverallPercentageMethylation ~ Sex
##
## Df Sum of Sq RSS AIC
## <none> 431.18 154.63
## - Sex 1 12.428 443.61 155.78
##
## Call:
## lm(formula = OverallPercentageMethylation ~ Sex, data = fullMetadata_OFFS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9467 -1.2674 -0.4101 0.6060 10.3697
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56.1035 0.2682 209.196 <2e-16 ***
## SexM -0.6692 0.3776 -1.772 0.0791 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.989 on 109 degrees of freedom
## Multiple R-squared: 0.02801, Adjusted R-squared: 0.0191
## F-statistic: 3.142 on 1 and 109 DF, p-value: 0.07911
plot(ggpredict(mod, terms = c("Sex")), add.data = T)
NB: this is in G2, considering positions shared by at least 14 fish per treatment group (half individuals per group), and which overlap with positions retained in G1, without sex and unknown chromosome (after filtering)
mod = lm(res_Nbr_methCpG_Nbr_coveredCpG ~ Sex, data = fullMetadata_OFFS)
summary(step(mod)) # sex is significant p = 0.000157 ***
## Start: AIC=2165.93
## res_Nbr_methCpG_Nbr_coveredCpG ~ Sex
##
## Df Sum of Sq RSS AIC
## <none> 3.1917e+10 2165.9
## - Sex 1 4493411296 3.6411e+10 2178.6
##
## Call:
## lm(formula = res_Nbr_methCpG_Nbr_coveredCpG ~ Sex, data = fullMetadata_OFFS)
##
## Residuals:
## Min 1Q Median 3Q Max
## -38952 -10289 -519 10434 45882
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6420 2307 2.782 0.006361 **
## SexM -12726 3248 -3.917 0.000157 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 17110 on 109 degrees of freedom
## Multiple R-squared: 0.1234, Adjusted R-squared: 0.1154
## F-statistic: 15.35 on 1 and 109 DF, p-value: 0.0001565
anova(mod)
## Analysis of Variance Table
##
## Response: res_Nbr_methCpG_Nbr_coveredCpG
## Df Sum Sq Mean Sq F value Pr(>F)
## Sex 1 4.4934e+09 4493411296 15.345 0.0001565 ***
## Residuals 109 3.1917e+10 292820190
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(ggpredict(mod, terms = c("Sex")), add.data = T) +
xlab(NULL)+
ylab("Residuals of N methylated sites on N covered sites") +
ggtitle("Predicted values of global methylation in offspring")
fullMetadata_OFFS$res_Nbr_methCpG_Nbr_coveredCpG_div1000 <- (fullMetadata_OFFS$res_Nbr_methCpG_Nbr_coveredCpG)/1000
mod_Tol.Meth <- lmer(BCI ~ res_Nbr_methCpG_Nbr_coveredCpG_div1000*No.Worms*PAT + (1|brotherPairID)+ (1|Sex),
data=fullMetadata_OFFS, REML = F)
## Model selection:
step(mod_Tol.Meth, reduce.random = F) # Model found: BCI ~ No.Worms + PAT + (1 | brotherPairID) + (1 | Sex) + No.Worms:PAT
## Backward reduced random-effect table:
##
## Eliminated npar logLik AIC LRT Df Pr(>Chisq)
## <none> 11 -606.89 1235.8
## (1 | brotherPairID) 0 10 -606.89 1233.8 0 1 1
## (1 | Sex) 0 10 -606.89 1233.8 0 1 1
##
## Backward reduced fixed-effect table:
## Degrees of freedom method: Satterthwaite
##
## Eliminated Sum Sq Mean Sq
## res_Nbr_methCpG_Nbr_coveredCpG_div1000:No.Worms:PAT 1 2780.8 2780.8
## res_Nbr_methCpG_Nbr_coveredCpG_div1000:PAT 2 2396.3 2396.3
## res_Nbr_methCpG_Nbr_coveredCpG_div1000:No.Worms 3 1829.8 1829.8
## res_Nbr_methCpG_Nbr_coveredCpG_div1000 4 2571.6 2571.6
## No.Worms:PAT 0 20659.8 20659.8
## NumDF DenDF F value Pr(>F)
## res_Nbr_methCpG_Nbr_coveredCpG_div1000:No.Worms:PAT 1 111 0.8465 0.35953
## res_Nbr_methCpG_Nbr_coveredCpG_div1000:PAT 1 111 0.7240 0.39667
## res_Nbr_methCpG_Nbr_coveredCpG_div1000:No.Worms 1 111 0.5492 0.46020
## res_Nbr_methCpG_Nbr_coveredCpG_div1000 1 111 0.7681 0.38270
## No.Worms:PAT 1 111 6.1283 0.01481
##
## res_Nbr_methCpG_Nbr_coveredCpG_div1000:No.Worms:PAT
## res_Nbr_methCpG_Nbr_coveredCpG_div1000:PAT
## res_Nbr_methCpG_Nbr_coveredCpG_div1000:No.Worms
## res_Nbr_methCpG_Nbr_coveredCpG_div1000
## No.Worms:PAT *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Model found:
## BCI ~ No.Worms + PAT + (1 | brotherPairID) + (1 | Sex) + No.Worms:PAT
## The slope of BCI on nbrworms varies upon treatment but methylation does NOT vary with tolerance
mod_Tol.Meth <- lmer(BCI ~ No.Worms*PAT + (1|brotherPairID)+ (1|Sex),
data=fullMetadata_OFFS)
## And by treatment instead of No.worms?
mod_Tol.Meth2 <- lmer(BCI ~ res_Nbr_methCpG_Nbr_coveredCpG_div1000*PAT*outcome + (1|brotherPairID)+ (1|Sex),
data=fullMetadata_OFFS, REML = F)
## Model selection:
step(mod_Tol.Meth2, reduce.random = F) # Model found: BCI ~ PAT + outcome + (1 | brotherPairID) + (1 | Sex) + PAT:outcome
## Backward reduced random-effect table:
##
## Eliminated npar logLik AIC LRT Df Pr(>Chisq)
## <none> 11 -603.06 1228.1
## (1 | brotherPairID) 0 10 -603.06 1226.1 0 1 1
## (1 | Sex) 0 10 -603.06 1226.1 0 1 1
##
## Backward reduced fixed-effect table:
## Degrees of freedom method: Satterthwaite
##
## Eliminated Sum Sq Mean Sq
## res_Nbr_methCpG_Nbr_coveredCpG_div1000:PAT:outcome 1 2156.6 2156.6
## res_Nbr_methCpG_Nbr_coveredCpG_div1000:outcome 2 494.6 494.6
## res_Nbr_methCpG_Nbr_coveredCpG_div1000:PAT 3 1034.5 1034.5
## res_Nbr_methCpG_Nbr_coveredCpG_div1000 4 2542.3 2542.3
## PAT:outcome 0 30432.9 30432.9
## NumDF DenDF F value Pr(>F)
## res_Nbr_methCpG_Nbr_coveredCpG_div1000:PAT:outcome 1 111 0.7034 0.403442
## res_Nbr_methCpG_Nbr_coveredCpG_div1000:outcome 1 111 0.1603 0.689644
## res_Nbr_methCpG_Nbr_coveredCpG_div1000:PAT 1 111 0.3348 0.564008
## res_Nbr_methCpG_Nbr_coveredCpG_div1000 1 111 0.8203 0.367046
## PAT:outcome 1 111 9.7478 0.002289
##
## res_Nbr_methCpG_Nbr_coveredCpG_div1000:PAT:outcome
## res_Nbr_methCpG_Nbr_coveredCpG_div1000:outcome
## res_Nbr_methCpG_Nbr_coveredCpG_div1000:PAT
## res_Nbr_methCpG_Nbr_coveredCpG_div1000
## PAT:outcome **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Model found:
## BCI ~ PAT + outcome + (1 | brotherPairID) + (1 | Sex) + PAT:outcome
The slope of BCI on nbr worms varies upon parental treatment, but methylation does NOT vary with tolerance
## By group, tolerance slope as a function of methylation residuals:
modFULL <- lmer(BCI ~ res_Nbr_methCpG_Nbr_coveredCpG_div1000*No.Worms + (1|brotherPairID) + (1|Sex),
data = fullMetadata_OFFS[fullMetadata_OFFS$trtG1G2 %in% c("NE_exposed", "E_exposed"),])
## Model selection:
step(modFULL, reduce.random = F) # Model found: BCI ~ (1 | brotherPairID) + (1 | Sex)
## Backward reduced random-effect table:
##
## Eliminated npar logLik AIC LRT Df Pr(>Chisq)
## <none> 7 -293.63 601.26
## (1 | brotherPairID) 0 6 -293.63 599.26 0.000000 1 1.0000
## (1 | Sex) 0 6 -293.66 599.32 0.067386 1 0.7952
##
## Backward reduced fixed-effect table:
## Degrees of freedom method: Satterthwaite
##
## Eliminated Sum Sq Mean Sq NumDF
## res_Nbr_methCpG_Nbr_coveredCpG_div1000:No.Worms 1 362.1 362.1 1
## No.Worms 2 577.4 577.4 1
## res_Nbr_methCpG_Nbr_coveredCpG_div1000 3 8726.5 8726.5 1
## DenDF F value Pr(>F)
## res_Nbr_methCpG_Nbr_coveredCpG_div1000:No.Worms 49.047 0.1098 0.7418
## No.Worms 51.973 0.1776 0.6752
## res_Nbr_methCpG_Nbr_coveredCpG_div1000 37.954 2.7327 0.1066
##
## Model found:
## BCI ~ (1 | brotherPairID) + (1 | Sex)
modFULL <- lmer(BCI ~ res_Nbr_methCpG_Nbr_coveredCpG_div1000*PAT + (1|brotherPairID) + (1|Sex),
data = fullMetadata_OFFS[fullMetadata_OFFS$trtG1G2 %in% c("NE_exposed", "E_exposed"),])
## Model selection:
step(modFULL, reduce.random = F) # Model found: BCI ~ PAT + (1 | brotherPairID) + (1 | Sex)
## Backward reduced random-effect table:
##
## Eliminated npar logLik AIC LRT Df Pr(>Chisq)
## <none> 7 -278.14 570.27
## (1 | brotherPairID) 0 6 -278.14 568.27 0.0000 1 1.0000
## (1 | Sex) 0 6 -278.93 569.86 1.5904 1 0.2073
##
## Backward reduced fixed-effect table:
## Degrees of freedom method: Satterthwaite
##
## Eliminated Sum Sq Mean Sq NumDF
## res_Nbr_methCpG_Nbr_coveredCpG_div1000:PAT 1 504 504 1
## res_Nbr_methCpG_Nbr_coveredCpG_div1000 2 1476 1476 1
## PAT 0 76908 76908 1
## DenDF F value Pr(>F)
## res_Nbr_methCpG_Nbr_coveredCpG_div1000:PAT 50.451 0.2624 0.6107
## res_Nbr_methCpG_Nbr_coveredCpG_div1000 51.749 0.7782 0.3818
## PAT 47.592 41.0412 6.195e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Model found:
## BCI ~ PAT + (1 | brotherPairID) + (1 | Sex)
# 1. get raw values
percmeth = percMethylation(uniteCov14_G2_woSexAndUnknowChrOVERLAP)
# Run PCA on complete data (CpG covered in all fish)
PCA_allpos <- myPCA(x = t(na.omit(percmeth)), incomplete = FALSE)
## [1] "The chosen model is:"
## BCI ~ No.Worms + PAT + (1 | brotherPairID) + (1 | Sex) + No.Worms:PAT
## <environment: 0x5595c0be2320>
We perform a PCA on 78384 CpG sites covered in all G2 individuals. We first perform a test on the complete dataset.
fviz_pca_ind(PCA_allpos$res.PCA, label="none", habillage=PCA_allpos$metadata$trtG1G2) +
scale_color_manual(values = colOffs)+
scale_shape_manual(values=c(19,19,19,19))
fviz_pca_ind(PCA_allpos$res.PCA, label="none", habillage=as.factor(PCA_allpos$metadata$brotherPairID))
# The function dimdesc() can be used to identify the most correlated variables with a given principal component.
mydimdesc <- dimdesc(PCA_allpos$res.PCA, axes = c(1,2), proba = 0.05)
There are 36212 CpG sites most correlated (p < 0.05) with the first principal component , and 15101 with the second principal component.
The 2 first PCA axes do not explain BCI (p<0.05)
# Percentage of variance explained by each factor:
formula(PCA_allpos$modSel) # BCI ~ No.Worms + PAT + (1 | brotherPairID) + (1 | Sex) + No.Worms:PAT
## BCI ~ No.Worms + PAT + (1 | brotherPairID) + (1 | Sex) + No.Worms:PAT
## <environment: 0x5595c0be2320>
mod_noworms = lmer(BCI ~ PAT + (1 | brotherPairID) + (1 | Sex), data = PCA_allpos$metadata)
mod_noPAT = lmer(BCI ~ No.Worms + (1 | brotherPairID) + (1 | Sex), data = PCA_allpos$metadata)
# R2c conditional R2 value associated with fixed effects plus the random effects.
A = (MuMIn::r.squaredGLMM(PCA_allpos$modSel)[2] -
MuMIn::r.squaredGLMM(mod_noworms)[2])*100
B = (MuMIn::r.squaredGLMM(PCA_allpos$modSel)[2] -
MuMIn::r.squaredGLMM(mod_noPAT)[2])*100
# Set up scatterplot
scatterplot <- ggplot(fullMetadata_OFFS,
aes(x = res_Nbr_methCpG_Nbr_coveredCpG,
y = BCI, fill=trtG1G2)) +
geom_point(pch=21, size =3, alpha = .8) +
guides(color = "none") +
scale_fill_manual(values = colOffs, name = "Treatment",
labels = c("G1 control - G2 control", "G1 control - G2 exposed", "G1 exposed - G2 control", "G1 exposed - G2 exposed")) +
theme(plot.margin = margin()) + theme_bw() +
theme(legend.position = "none") +
xlab("Methylation residuals (methylated sites/coverage")+
ylab("Body Condition Index")
# Define marginal histogram
marginal_distribution <- function(x, var, group) {
ggplot(x, aes_string(x = var, fill = group)) +
# geom_histogram(bins = 30, alpha = 0.4, position = "identity") +
geom_density(alpha = 0.6, size = 0.2) +
guides(fill = "none") +
scale_fill_manual(values = colOffs) +
theme_void() +
theme(plot.margin = margin())
}
# Set up marginal histograms
x_hist <- marginal_distribution(fullMetadata_OFFS, "res_Nbr_methCpG_Nbr_coveredCpG", "trtG1G2")
y_hist <- marginal_distribution(fullMetadata_OFFS, "BCI", "trtG1G2") +
coord_flip()
# Align histograms with scatterplot
aligned_x_hist <- align_plots(x_hist, scatterplot, align = "v")[[1]]
aligned_y_hist <- align_plots(y_hist, scatterplot, align = "h")[[1]]
# Arrange plots
cowplot::plot_grid(
aligned_x_hist, NULL, scatterplot, aligned_y_hist, ncol = 2, nrow = 2, rel_heights = c(0.2, 1), rel_widths = c(1, 0.2)
)
Methylation profiles, CpG present in all fish
Methylation profile for the 55530 CpG sites covered in all G1 & G2 fish (N = 135:
makePrettyMethCluster(uniteCovALL_woSexAndUnknowChr, fullMetadata,
my.cols.trt=c("#333333ff","#ff0000ff","#ffe680ff","#ff6600ff","#aaccffff","#aa00d4ff"),
my.cols.fam = c(1:4), nbrk = 8)
Methylation profile for the 78384 CpG sites covered in all G2 fish (N = 111:
makePrettyMethCluster(uniteCovALL_G2_woSexAndUnknowChr, fullMetadata_OFFS,
my.cols.trt=c("#ffe680ff","#ff6600ff", "#aaccffff", "#aa00d4ff"),
my.cols.fam = c(1:4), nbrk = 8)
Permutational multivariate analysis of variance (PERMANOVA) is a non-parametric multivariate statistical test. It is used to compare groups of objects and test the null hypothesis that the centroids and dispersion of the groups as defined by measure space are equivalent for all groups.
# make distance matrix with B-C distances
data.dist = makeDatadistFUN(uniteCovALL_G2_woSexAndUnknowChr)
## Adonis test: importance of each predictor
adonis2(data.dist ~ PAT * outcome * Sex * brotherPairID, data = fullMetadata_OFFS)
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Permutation: free
## Number of permutations: 999
##
## adonis2(formula = data.dist ~ PAT * outcome * Sex * brotherPairID, data = fullMetadata_OFFS)
## Df SumOfSqs R2 F Pr(>F)
## PAT 1 0.002782 0.01470 1.8292 0.001 ***
## outcome 1 0.001909 0.01009 1.2552 0.022 *
## Sex 1 0.002418 0.01277 1.5895 0.001 ***
## brotherPairID 7 0.028018 0.14805 2.6316 0.001 ***
## PAT:outcome 1 0.001680 0.00888 1.1045 0.126
## PAT:Sex 1 0.001492 0.00789 0.9813 0.485
## outcome:Sex 1 0.001557 0.00823 1.0240 0.348
## PAT:brotherPairID 7 0.014344 0.07580 1.3473 0.001 ***
## outcome:brotherPairID 7 0.010591 0.05596 0.9947 0.506
## Sex:brotherPairID 7 0.010394 0.05492 0.9763 0.666
## PAT:outcome:Sex 1 0.001453 0.00768 0.9555 0.637
## PAT:outcome:brotherPairID 7 0.010351 0.05470 0.9722 0.727
## PAT:Sex:brotherPairID 7 0.010249 0.05415 0.9626 0.738
## outcome:Sex:brotherPairID 6 0.008749 0.04623 0.9587 0.747
## PAT:outcome:Sex:brotherPairID 1 0.001127 0.00596 0.7413 0.997
## Residual 54 0.082132 0.43399
## Total 110 0.189247 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Results: family of the father (brotherPairID) explains more than 14% of the variance in methylation.
To focus on G1 and G2 treatments, we define the permutation structure considering brother pairs (N = 8), and use a PERMANOVA to test the hypothesis that paternal treatment, offspring treatment and their interactions significantly influencing global methylation.
perm <- how(nperm = 1000) # 1000 permutations
setBlocks(perm) <- with(fullMetadata_OFFS, brotherPairID) # define the permutation structure considering brotherPairID and sex
print(adonis2(data.dist ~ PAT * outcome * Sex, data = fullMetadata_OFFS, permutations = perm))
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Blocks: with(fullMetadata_OFFS, brotherPairID)
## Permutation: free
## Number of permutations: 1000
##
## adonis2(formula = data.dist ~ PAT * outcome * Sex, data = fullMetadata_OFFS, permutations = perm)
## Df SumOfSqs R2 F Pr(>F)
## PAT 1 0.002782 0.01470 1.6344 0.000999 ***
## outcome 1 0.001909 0.01009 1.1216 0.050949 .
## Sex 1 0.002418 0.01277 1.4203 0.008991 **
## PAT:outcome 1 0.001716 0.00907 1.0080 0.093906 .
## PAT:Sex 1 0.001740 0.00920 1.0224 0.181818
## outcome:Sex 1 0.001703 0.00900 1.0004 0.313686
## PAT:outcome:Sex 1 0.001653 0.00874 0.9712 0.343656
## Residual 103 0.175326 0.92644
## Total 110 0.189247 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Results:
#### RUN Goodness of fit
myGOF.NMDS.FUN(dataset = uniteCovALL_G2_woSexAndUnknowChr)
Goodness of fit for NMDS suggested the presence of six dimensions with a
stress value > 0.1 and 2 with > 0.2
## to find the seed that allows convergence:
# sapply(3:10, function(x) myNMDS(dataset = uniteCovALL_G2_woSexAndUnknowChr, metadata = fullMetadata_OFFS, myseed = x))
NMDSanalysis <- myNMDSFUN(dataset = uniteCovALL_G2_woSexAndUnknowChr, metadata = fullMetadata_OFFS, myseed = 4)
# png(filename = "Rfigures/NMDSplot_allG2.png", width = 900, height = 1100)
NMDSanalysis$NMDSplot
# dev.off()
AdonisWithinG1trtFUN(trtgp = c(2,3))
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Blocks: with(fullMetadata_OFFS[fullMetadata_OFFS$trtG1G2_NUM %in% trtgp,
## Permutation: free
## Number of permutations: 1000
##
## adonis2(formula = data.dist ~ outcome + Sex + brotherPairID, data = fullMetadata_OFFS[fullMetadata_OFFS$trtG1G2_NUM %in% trtgp, ], permutations = perm)
## Model: adonis0(formula = ~outcome + Sex + brotherPairID, data = data, method = method)
## Df SumOfSqs R2 F Pr(>F)
## outcome 1 0.001475 0.01622 1.0073 0.486513
## Sex 1 0.002094 0.02302 1.4301 0.000999 ***
## brotherPairID 7 0.020026 0.22018 1.9537 0.001998 **
## Residual 46 0.067357 0.74058
## Total 55 0.090952 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Results:
NMDSanalysis_G1control <- myNMDSFUN(dataset = uniteCovALL_G2_woSexAndUnknowChr,
metadata = fullMetadata_OFFS, myseed = 25,
byParentTrt=TRUE,
trtgp = c(2,3))
#png(filename = "Rfigures/NMDSplot_G1fromControlG2.png", width = 900, height = 900)
NMDSanalysis_G1control$NMDSplot
#dev.off()
AdonisWithinG1trtFUN(trtgp = c(5,6))
## Permutation test for adonis under reduced model
## Terms added sequentially (first to last)
## Blocks: with(fullMetadata_OFFS[fullMetadata_OFFS$trtG1G2_NUM %in% trtgp,
## Permutation: free
## Number of permutations: 1000
##
## adonis2(formula = data.dist ~ outcome + Sex + brotherPairID, data = fullMetadata_OFFS[fullMetadata_OFFS$trtG1G2_NUM %in% trtgp, ], permutations = perm)
## Model: adonis0(formula = ~outcome + Sex + brotherPairID, data = data, method = method)
## Df SumOfSqs R2 F Pr(>F)
## outcome 1 0.002160 0.02262 1.4047 0.006993 **
## Sex 1 0.002053 0.02150 1.3349 0.192807
## brotherPairID 7 0.022076 0.23117 2.0507 0.019980 *
## Residual 45 0.069205 0.72470
## Total 54 0.095494 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Results:
NMDSanalysis_G1infected <- myNMDSFUN(dataset = uniteCovALL_G2_woSexAndUnknowChr,
metadata = fullMetadata_OFFS, myseed = 25,
byParentTrt=TRUE,
trtgp = c(5,6))
#png(filename = "Rfigures/NMDSplot_G1fromInfectedG2.png", width = 900, height = 900)
NMDSanalysis_G1infected$NMDSplot
#dev.off()
Differential methylation by brother pair (sex as covariate):
## All in DMSBPlist
## Extract DMS (by position)
myPosList = lapply(DMSBPlist, lapply, function(x){paste(x$chr, x$end)})
## Find DMS present in at least 4 BP out of 8 (half):
get2keep = function(Compa, NBP = 4){
x <- lapply(myPosList, function(x){unlist(x[[paste0("DMS_15pc_BP_", Compa)]])})
f <- table(unlist((x))) # each DMS present between 1 and 8 times
tokeep <- names(f)[f >= NBP]
# print(length(tokeep))
## Keep the DMS present in 4 families minimum
DMSBPlist_INTER4 <- lapply(x, function(x){x[x %in% tokeep]})
## Reorder by family:
DMSBPlist_INTER4 <- DMSBPlist_INTER4[names(DMSBPlist_INTER4)[order(names(DMSBPlist_INTER4))]]
return(DMSBPlist_INTER4)
}
## Prepare df for complexUpset
getUpsetDF = function(i, NBP){ # for a given comparison
A = get2keep(vecCompa[i], NBP)
A2 = lapply(A, function(x){
x = data.frame(x) # vector of DMS as df
names(x) = "DMS" # name each CpG
return(x)
})
## Add BP name
for (i in 1:length(names(A2))){
A2[[i]]["BP"] = names(A2)[i]
}
# make a dataframe
A2 = A2 %>% reduce(full_join, by = "DMS")
# names column with BP id
for (i in 2:ncol(A2)) {names(A2)[i] = unique(A2[!is.na(A2[i]), i])}
# replace by 0 or 1 the DMS absence/presence
A = data.frame(apply(A2[2:9], 2, function(x) ifelse(is.na(x), 0, 1)))
# add DMS
A$DMS = A2$DMS
return(A)
}
## Vector of all 4 comparisons
vecCompa <- c("CC_TC", "CT_TT", "CC_CT", "TC_TT")
vecCompaVerbose <- c("Control offspring in control vs infected parents", "Infected offspring in control vs infected parents", "Control vs infected offspring from control parent", "Control vs infected offspring from infected parent")
Make upset plots for DMS found in at least 4 BP:
for (i in 1:4){
df = getUpsetDF(i, NBP = 4)
print(ComplexUpset::upset(
df,
names(df)[1:8],
width_ratio=0.1,
themes=upset_default_themes(text=element_text(size=25)),
sort_intersections_by=c('degree', 'cardinality'),
queries=query_by_degree(
df, names(df)[1:8],
params_by_degree=list(
'1'=list(color='red', fill='red'),
'2'=list(color='purple', fill='purple'),
'3'=list(color='blue', fill='blue'),
'4'=list(color='grey', fill='grey'),
'5'=list(color='red', fill='red'),
'6'=list(color='purple', fill='purple'),
'7'=list(color='blue', fill='blue'),
'8'=list(color='green', fill='green')
),
only_components=c("intersections_matrix", "Intersection size")
)) + ggtitle(paste0("Differentially methylated sites found in more than four brother pairs in the comparison: \n", vecCompaVerbose[i])) + theme(plot.title = element_text(size = 30)))
}
Make upset plots for DMS found in at least 6 BP:
for (i in 1:4){
df = getUpsetDF(i, NBP = 6)
print(ComplexUpset::upset(
df,
names(df)[1:8],
width_ratio=0.1,
themes=upset_default_themes(text=element_text(size=15)),
sort_intersections_by=c('degree', 'cardinality'),
queries=query_by_degree(
df, names(df)[1:8],
params_by_degree=list(
'1'=list(color='red', fill='red'),
'2'=list(color='purple', fill='purple'),
'3'=list(color='blue', fill='blue'),
'4'=list(color='grey', fill='grey'),
'5'=list(color='red', fill='red'),
'6'=list(color='purple', fill='purple'),
'7'=list(color='blue', fill='blue'),
'8'=list(color='green', fill='green')
),
only_components=c("intersections_matrix", "Intersection size")
)) + ggtitle(paste0("Differentially methylated sites found in more than six brother pairs in the comparison: \n", vecCompaVerbose[i]))) #+ theme(plot.title = element_text(size = 30)))
}
plotManhattanGenes <- function(i){
# print(paste0("The number of DMS in the ", vecCompa[i] ," comparison is:"))
DMSvec = unique(unlist(get2keep(vecCompa[i])))
# Annotate the DMS present in at least 4 BP:
# Change the vector into a methobject:
df <- data.frame(chr=sapply(strsplit(DMSvec, " "), `[`, 1),
start=sapply(strsplit(DMSvec, " "), `[`, 2),
end=sapply(strsplit(DMSvec, " "), `[`, 2))
# get annotation
anot4BP <- getAnnotationFun(makeGRangesFromDataFrame(df))
# Add info: in how many Brother pairs is a differential methylation found in a gene?
anotBPs = data.frame()
for (j in 4:8){
vec = names(table(unlist(get2keep(vecCompa[i])))[table(unlist(get2keep(vecCompa[i]))) %in% j])
if(!is_empty(vec)){ # in case of 8 BP, empty in some comparisons
df = data.frame(chr=sapply(strsplit(vec, " "), `[`, 1),
start=sapply(strsplit(vec, " "), `[`, 2),
end=sapply(strsplit(vec, " "), `[`, 2))
anot = getAnnotationFun(makeGRangesFromDataFrame(df))
anot$N_BP = j
anotBPs = rbind(anotBPs, anot)
}
}
# keep the highest N of BP (how many BP have at least one site DM in the gene)
anotBPs = anotBPs %>%
group_by(ID) %>%
dplyr::slice(which.max(N_BP)) %>% dplyr::select(c(ID, N_BP))
# Ad N_BP info
anot4BP = merge(anot4BP, anotBPs)
# Reorder chromosome
anot4BP <- anot4BP %>%
mutate(chr = gsub("Gy_chr", "", seqid), chrom_nr = seqid %>% deroman(), chrom_order=factor(chrom_nr) %>%
as.numeric()) %>% arrange(chrom_order) %>%
mutate(geneLengthkb = (end - start)/1000, nCpGperGenekb = nCpG/geneLengthkb)
# Plot
plot = ggplot(anot4BP, aes(x=start, y = nCpGperGenekb)) + geom_point(aes(col=as.factor(N_BP)), size = 2) +
scale_color_manual(values = c('grey', 'red', 'purple', 'blue', 'green'),
name = "Gene found differentially methylated in N brother pairs:") +
facet_grid(.~fct_inorder(chr)) +
geom_hline(yintercept = 1)+
theme(axis.text.x=element_blank(), legend.position="top") +
xlab(paste0("Genes with DMS present in at least 4 brother pairs\nComparison: ", vecCompa[i])) +
ylab("Number of differentially methylated CpG per gene kb")
## Genes with at least 1 CpG differentially methylated per kb:
topGenes = anot4BP[anot4BP$nCpGperGenekb >= 1,]
return(list(plot = plot, anot4BP = anot4BP, topGenes = topGenes))
}
plotManhattanGenes(1)$plot
plotManhattanGenes(2)$plot
plotManhattanGenes(3)$plot
plotManhattanGenes(4)$plot
# create gene universe
gene_universe <- data.frame(
subsetByOverlaps(GRanges(annotGff3), GRanges(uniteCov14_G2_woSexAndUnknowChrOVERLAP))) %>% # subselect covered CpGs
filter(lengths(Ontology_term)!=0) %>% # rm non existing GO terms
filter(type %in% "gene") %>% # keep all the 7416 genes with GO terms
dplyr::select(c("Name", "Ontology_term")) %>%
mutate(go_linkage_type = "IEA") %>% #NB: IEA but not necessarily true, it's from Interproscan after Maker. Sticklebacks (biomart) have 82701 IEA and 63 ISS.
relocate("Ontology_term","go_linkage_type","Name") %>%
unnest(Ontology_term) %>% # one GO per line (was a list before in this column)
data.frame()
# Create gene set collection
goFrame <- GOFrame(gene_universe, organism="Gasterosteus aculeatus")
goAllFrame <- GOAllFrame(goFrame)
gsc_universe <- GeneSetCollection(goAllFrame, setType = GOCollection())
IMPORTANT NOTE from Mel: why conditional hypergeometric test? The GO ontology is set up as a directed acyclic graph, where a parent term is comprised of all its child terms. If you do a standard hypergeometric, you might e.g., find ‘positive regulation of kinase activity’ to be significant. If you then test ‘positive regulation of catalytic activity’, which is a parent term, then it might be significant as well, but only because of the terms coming from positive regulation of kinase activity.
The conditional hypergeometric takes this into account, and only uses those terms that were not already significant when testing a higher order (parent) term.
## For each comparison:
A=makedfGO(1, gene_universe); B=makedfGO(2, gene_universe); C=makedfGO(3, gene_universe); D=makedfGO(4, gene_universe)
dfGO = rbind(A, B, C, D)
# add type of comparison:
dfGO$group = "Different parental treatment"
dfGO$group[dfGO$Comp %in% c("CC_CT", "TC_TT")] = "Different offpring treatment"
dfGO %>% ggplot(aes(x=Comp, y = factor(GO.name))) +
geom_point(aes(color = p.value.adjusted, size = genePercent)) +
scale_color_gradient(name="adjusted\np-value", low = "red", high = "blue") +
scale_size_continuous(name = "% of genes")+
theme_bw() + ylab("") + xlab("Treatments comparison") +
theme(legend.box.background = element_rect(fill = "#ebebeb", color = "#ebebeb"),
legend.background = element_rect(fill = "#ebebeb", color = "#ebebeb"),
legend.key = element_rect(fill = "#ebebeb", color = "#ebebeb"), legend.position="left") + # grey box for legend
facet_grid(fct_inorder(GO.category)~group, scales="free",space = "free")
getGeneSummary <- function(i){
topgenes = plotManhattanGenes(i)$topGenes
df = data.frame(GeneSymbol = sapply(topgenes$Note, function(x) sub("Similar to ", "", x) %>% str_extract(".*:") %>% str_remove(":") %>% toupper), # extract uniprot symbol from note, then uppercase
N_BP = topgenes$N_BP, seqid = topgenes$seqid) #add info: on how many BP? which chr? how many Cpg ar diffmeth?nCpGperGenekb = topgenes$nCpGperGenekb,
# Convert the uniprot gene names to entrez ids
topGenesDF = unlist(mapIds(org.Hs.eg.db, keys = df$GeneSymbol, column = "ENTREZID", keytype = "SYMBOL")) %>% data.frame()
names(topGenesDF) = "ENTREZID" ; topGenesDF$GeneSymbol = rownames(topGenesDF) ; rownames(topGenesDF) = NULL
# Retrieve gene summary & description
genes = entrez_summary(db="gene", id=topGenesDF$ENTREZID)
SummaDF = lapply(genes, function(x) x[["summary"]]) %>% unlist() %>% data.frame()
names(SummaDF) = "Summary" ; SummaDF$ENTREZID = rownames(SummaDF) ; rownames(SummaDF) = NULL
DescDF = lapply(genes, function(x) x[["description"]]) %>% unlist() %>% data.frame()
names(DescDF) = "description" ; DescDF$ENTREZID = rownames(DescDF) ; rownames(DescDF) = NULL
# Output complete table
topGenesDF = merge(merge(DescDF, topGenesDF, all = T), SummaDF, all = T)
# Add which comparison
topGenesDF$comparison = vecCompa[i]
# Add Number of brother pairs in which the gene is found
topGenesDF = merge(topGenesDF, df)
# order by N_BP
topGenesDF = topGenesDF[order(topGenesDF$N_BP, decreasing = T),]
return(topGenesDF) # NB: some genes are not found automatically
}
| GeneSymbol | ENTREZID | description | Summary | comparison | N_BP | seqid | |
|---|---|---|---|---|---|---|---|
| 1 | BMP2 | 650 | bone morphogenetic protein 2 | This gene encodes a secreted ligand of the TGF-beta (transforming growth factor-beta) superfamily of proteins. Ligands of this family bind various TGF-beta receptors leading to recruitment and activation of SMAD family transcription factors that regulate gene expression. The encoded preproprotein is proteolytically processed to generate each subunit of the disulfide-linked homodimer, which plays a role in bone and cartilage development. Duplication of a regulatory region downstream of this gene causes a form of brachydactyly characterized by a malformed index finger and second toe in human patients. [provided by RefSeq, Jul 2016] | CC_TC | 7 | Gy_chrXVIII |
| 10 | LIM2 | 3982 | lens intrinsic membrane protein 2 | This gene encodes an eye lens-specific protein found at the junctions of lens fiber cells, where it may contribute to cell junctional organization. It acts as a receptor for calmodulin, and may play an important role in both lens development and cataractogenesis. Mutations in this gene have been associated with cataract formation. Alternatively spliced transcript variants encoding different isoforms have been found for this gene.[provided by RefSeq, Sep 2009] | CC_TC | 6 | Gy_chrI |
| 16 | PROKR1 | 10887 | prokineticin receptor 1 | This gene encodes a member of the G-protein-coupled receptor family. The encoded protein binds to prokineticins (1 and 2), leading to the activation of MAPK and STAT signaling pathways. Prokineticins are protein ligands involved in angiogenesis and inflammation. The encoded protein is expressed in peripheral tissues such as those comprising the circulatory system, lungs, reproductive system, endocrine system and the gastrointestinal system. The protein may be involved in signaling in human fetal ovary during initiation of primordial follicle formation. Sequence variants in this gene may be associated with recurrent miscarriage. [provided by RefSeq, Aug 2016] | CC_TC | 6 | Gy_chrVI |
| 3 | CIAO2B | 51647 | cytosolic iron-sulfur assembly component 2B | CC_TC | 5 | Gy_chrVI | |
| 4 | DGKE | 8526 | diacylglycerol kinase epsilon | Diacylglycerol kinases are thought to be involved mainly in the regeneration of phosphatidylinositol (PI) from diacylglycerol in the PI-cycle during cell signal transduction. When expressed in mammalian cells, DGK-epsilon shows specificity for arachidonyl-containing diacylglycerol. DGK-epsilon is expressed predominantly in testis. [provided by RefSeq, Jul 2008] | CC_TC | 5 | Gy_chrIX |
| 6 | GTF2IRD2 | 84163 | GTF2I repeat domain containing 2 | This gene is one of several closely related genes on chromosome 7 encoding proteins containing helix-loop-helix motifs. These proteins may function as regulators of transcription. The encoded protein is unique in that its C-terminus is derived from CHARLIE8 transposable element sequence. This gene is located in a region of chromosome 7 that is deleted in Williams-Beuren syndrome, and loss of this locus may contribute to the cognitive phenotypes observed in this disease. Alternative splicing results in multiple transcript variants. [provided by RefSeq, Jul 2013] | CC_TC | 5 | Gy_chrVIII |
| 7 | HBA2 | 3040 | hemoglobin subunit alpha 2 | The human alpha globin gene cluster located on chromosome 16 spans about 30 kb and includes seven loci: 5’- zeta - pseudozeta - mu - pseudoalpha-1 - alpha-2 - alpha-1 - theta - 3’. The alpha-2 (HBA2) and alpha-1 (HBA1) coding sequences are identical. These genes differ slightly over the 5’ untranslated regions and the introns, but they differ significantly over the 3’ untranslated regions. Two alpha chains plus two beta chains constitute HbA, which in normal adult life comprises about 97% of the total hemoglobin; alpha chains combine with delta chains to constitute HbA-2, which with HbF (fetal hemoglobin) makes up the remaining 3% of adult hemoglobin. Alpha thalassemias result from deletions of each of the alpha genes as well as deletions of both HBA2 and HBA1; some nondeletion alpha thalassemias have also been reported. [provided by RefSeq, Jul 2008] | CC_TC | 5 | Gy_chrV |
| 22 | TENT5A | 55603 | terminal nucleotidyltransferase 5A | CC_TC | 5 | Gy_chrXVIII | |
| 23 | VIM | 7431 | vimentin | This gene encodes a type III intermediate filament protein. Intermediate filaments, along with microtubules and actin microfilaments, make up the cytoskeleton. The encoded protein is responsible for maintaining cell shape and integrity of the cytoplasm, and stabilizing cytoskeletal interactions. This protein is involved in neuritogenesis and cholesterol transport and functions as an organizer of a number of other critical proteins involved in cell attachment, migration, and signaling. Bacterial and viral pathogens have been shown to attach to this protein on the host cell surface. Mutations in this gene are associated with congenital cataracts in human patients. [provided by RefSeq, Aug 2017] | CC_TC | 5 | Gy_chrXX |
| 2 | CDC42EP1 | 11135 | CDC42 effector protein 1 | CDC42 is a member of the Rho GTPase family that regulates multiple cellular activities, including actin polymerization. The protein encoded by this gene is a CDC42 binding protein that mediates actin cytoskeleton reorganization at the plasma membrane. This protein is secreted and is primarily found in bone marrow. [provided by RefSeq, Jul 2008] | CC_TC | 4 | Gy_chrXI |
| 5 | FKBP3 | 2287 | FKBP prolyl isomerase 3 | The protein encoded by this gene is a member of the immunophilin protein family, which play a role in immunoregulation and basic cellular processes involving protein folding and trafficking. This encoded protein is a cis-trans prolyl isomerase that binds the immunosuppressants FK506 and rapamycin, as well as histone deacetylases, the transcription factor YY1, casein kinase II, and nucleolin. It has a higher affinity for rapamycin than for FK506 and thus may be an important target molecule for immunosuppression by rapamycin. [provided by RefSeq, Sep 2008] | CC_TC | 4 | Gy_chrXVIII |
| 8 | KCNJ6 | 3763 | potassium inwardly rectifying channel subfamily J member 6 | This gene encodes a member of the G protein-coupled inwardly-rectifying potassium channel family of inward rectifier potassium channels. This type of potassium channel allows a greater flow of potassium into the cell than out of it. These proteins modulate many physiological processes, including heart rate in cardiac cells and circuit activity in neuronal cells, through G-protein coupled receptor stimulation. Mutations in this gene are associated with Keppen-Lubinsky Syndrome, a rare condition characterized by severe developmental delay, facial dysmorphism, and intellectual disability. [provided by RefSeq, Apr 2015] | CC_TC | 4 | Gy_chrVII |
| 9 | KCNK9 | 51305 | potassium two pore domain channel subfamily K member 9 | This gene encodes a protein that contains multiple transmembrane regions and two pore-forming P domains and functions as a pH-dependent potassium channel. Amplification and overexpression of this gene have been observed in several types of human carcinomas. This gene is imprinted in the brain, with preferential expression from the maternal allele. A mutation in this gene was associated with Birk-Barel dysmorphism syndrome. Alternative splicing results in multiple transcript variants. [provided by RefSeq, Jul 2017] | CC_TC | 4 | Gy_chrX |
| 11 | MAG | 4099 | myelin associated glycoprotein | The protein encoded by this gene is a type I membrane protein and member of the immunoglobulin superfamily. It is thought to be involved in the process of myelination. It is a lectin that binds to sialylated glycoconjugates and mediates certain myelin-neuron cell-cell interactions. Three alternatively spliced transcripts encoding different isoforms have been described for this gene. [provided by RefSeq, Nov 2010] | CC_TC | 4 | Gy_chrXX |
| 12 | MAP1LC3C | 440738 | microtubule associated protein 1 light chain 3 gamma | Autophagy is a highly regulated bulk degradation process that plays an important role in cellular maintenance and development. MAP1LC3C is an ortholog of the yeast autophagosome protein Atg8 (He et al., 2003 [PubMed 12740394]).[supplied by OMIM, Nov 2010] | CC_TC | 4 | Gy_chrIV |
| 13 | ME3 | 10873 | malic enzyme 3 | Malic enzyme catalyzes the oxidative decarboxylation of malate to pyruvate using either NAD+ or NADP+ as a cofactor. Mammalian tissues contain 3 distinct isoforms of malic enzyme: a cytosolic NADP(+)-dependent isoform, a mitochondrial NADP(+)-dependent isoform, and a mitochondrial NAD(+)-dependent isoform. This gene encodes a mitochondrial NADP(+)-dependent isoform. Multiple alternatively spliced transcript variants have been found for this gene, but the biological validity of some variants has not been determined. [provided by RefSeq, Jul 2008] | CC_TC | 4 | Gy_chrXVI |
| 14 | MFSD1 | 64747 | major facilitator superfamily domain containing 1 | CC_TC | 4 | Gy_chrI | |
| 15 | NPBWR1 | 2831 | neuropeptides B and W receptor 1 | CC_TC | 4 | Gy_chrXII | |
| 17 | PRSS12 | 8492 | serine protease 12 | This gene encodes a member of the trypsin family of serine proteases and contains a signal peptide, a proline-rich region, a Kringle domain, four scavenger receptor cysteine-rich domains, and a trypsin-like serine protease domain. The protein, sometimes referred to as neurotrypsin or motopsin, is secreted from neuronal cells and localizes to the synaptic cleft. Studies in mice show that this protein cleaves a protein, agrin, that is important for the formation and maintenance of exitatory synapses. Defects in this gene cause a form of autosomal recessive cognitive impairment (MRT1). [provided by RefSeq, Jul 2017] | CC_TC | 4 | Gy_chrV |
| 18 | RASGEF1BB | NA | NA | NA | CC_TC | 4 | Gy_chrXIV |
| 19 | RSPH9 | 221421 | radial spoke head component 9 | This gene encodes a protein thought to be a component of the radial spoke head in motile cilia and flagella. Mutations in this gene are associated with primary ciliary dyskinesia 12. Alternative splicing results in multiple transcript variants.[provided by RefSeq, Jul 2010] | CC_TC | 4 | Gy_chrXVIII |
| 20 | SELENOF | 9403 | selenoprotein F | The protein encoded by this gene belongs to the SEP15/selenoprotein M family. The exact function of this protein is not known; however, it has been found to associate with UDP-glucose:glycoprotein glucosyltransferase (UGTR), an endoplasmic reticulum(ER)-resident protein, which is involved in the quality control of protein folding. The association with UGTR retains this protein in the ER, where it may play a role in protein folding. It has also been suggested to have a role in cancer etiology. This protein is a selenoprotein, containing the rare amino acid selenocysteine (Sec). Sec is encoded by the UGA codon, which normally signals translation termination. The 3’ UTRs of selenoprotein mRNAs contain a conserved stem-loop structure, designated the Sec insertion sequence (SECIS) element, that is necessary for the recognition of UGA as a Sec codon, rather than as a stop signal. Alternatively spliced transcript variants have been found for this gene. [provided by RefSeq, Nov 2016] | CC_TC | 4 | Gy_chrIII |
| 21 | TAFA1 | 407738 | TAFA chemokine like family member 1 | This gene is a member of the TAFA family which is composed of five highly homologous genes that encode small secreted proteins. These proteins contain conserved cysteine residues at fixed positions, and are distantly related to MIP-1alpha, a member of the CC-chemokine family. The TAFA proteins are predominantly expressed in specific regions of the brain, and are postulated to function as brain-specific chemokines or neurokines that act as regulators of immune and nervous cells. [provided by RefSeq, Jul 2008] | CC_TC | 4 | Gy_chrXII |
| 24 | YY1 | 7528 | YY1 transcription factor | YY1 is a ubiquitously distributed transcription factor belonging to the GLI-Kruppel class of zinc finger proteins. The protein is involved in repressing and activating a diverse number of promoters. YY1 may direct histone deacetylases and histone acetyltransferases to a promoter in order to activate or repress the promoter, thus implicating histone modification in the function of YY1. [provided by RefSeq, Jul 2008] | CC_TC | 4 | Gy_chrXVIII |
| GeneSymbol | ENTREZID | description | Summary | comparison | N_BP | seqid | |
|---|---|---|---|---|---|---|---|
| 6 | ELFN2 | 114794 | extracellular leucine rich repeat and fibronectin type III domain containing 2 | CT_TT | 8 | Gy_chrXI | |
| 19 | MNCB-2990 | NA | NA | NA | CT_TT | 6 | Gy_chrIII |
| 9 | FKBP3 | 2287 | FKBP prolyl isomerase 3 | The protein encoded by this gene is a member of the immunophilin protein family, which play a role in immunoregulation and basic cellular processes involving protein folding and trafficking. This encoded protein is a cis-trans prolyl isomerase that binds the immunosuppressants FK506 and rapamycin, as well as histone deacetylases, the transcription factor YY1, casein kinase II, and nucleolin. It has a higher affinity for rapamycin than for FK506 and thus may be an important target molecule for immunosuppression by rapamycin. [provided by RefSeq, Sep 2008] | CT_TT | 5 | Gy_chrXVIII |
| 10 | FZD2 | 2535 | frizzled class receptor 2 | This intronless gene is a member of the frizzled gene family. Members of this family encode seven-transmembrane domain proteins that are receptors for the wingless type MMTV integration site family of signaling proteins. This gene encodes a protein that is coupled to the beta-catenin canonical signaling pathway. Competition between the wingless-type MMTV integration site family, member 3A and wingless-type MMTV integration site family, member 5A gene products for binding of this protein is thought to regulate the beta-catenin-dependent and -independent pathways. [provided by RefSeq, Dec 2010] | CT_TT | 5 | Gy_chrXI |
| 11 | HAPLN3 | 145864 | hyaluronan and proteoglycan link protein 3 | This gene belongs to the hyaluronan and proteoglycan binding link protein gene family. The protein encoded by this gene may function in hyaluronic acid binding and cell adhesion. [provided by RefSeq, Jul 2008] | CT_TT | 5 | Gy_chrII |
| 20 | MTG2 | 26164 | mitochondrial ribosome associated GTPase 2 | Small G proteins, such as GTPBP5, act as molecular switches that play crucial roles in the regulation of fundamental cellular processes such as protein synthesis, nuclear transport, membrane trafficking, and signal transduction (Hirano et al., 2006 [PubMed 17054726]).[supplied by OMIM, Mar 2008] | CT_TT | 5 | Gy_chrXII |
| 24 | PHEX | 5251 | phosphate regulating endopeptidase X-linked | The protein encoded by this gene is a transmembrane endopeptidase that belongs to the type II integral membrane zinc-dependent endopeptidase family. The protein is thought to be involved in bone and dentin mineralization and renal phosphate reabsorption. Mutations in this gene cause X-linked hypophosphatemic rickets. Alternative splicing results in multiple transcript variants. [provided by RefSeq, Sep 2013] | CT_TT | 5 | Gy_chrXXI |
| 1 | ACBD5A | NA | NA | NA | CT_TT | 4 | Gy_chrXXI |
| 2 | CDPF1 | 150383 | cysteine rich DPF motif domain containing 1 | CT_TT | 4 | Gy_chrIV | |
| 3 | CHRNA9 | 55584 | cholinergic receptor nicotinic alpha 9 subunit | This gene is a member of the ligand-gated ionic channel family and nicotinic acetylcholine receptor gene superfamily. It encodes a plasma membrane protein that forms homo- or hetero-oligomeric divalent cation channels. This protein is involved in cochlea hair cell development and is also expressed in the outer hair cells (OHCs) of the adult cochlea. [provided by RefSeq, Feb 2012] | CT_TT | 4 | Gy_chrIV |
| 4 | CIAO2B | 51647 | cytosolic iron-sulfur assembly component 2B | CT_TT | 4 | Gy_chrVI | |
| 5 | EIF4A3 | 9775 | eukaryotic translation initiation factor 4A3 | This gene encodes a member of the DEAD box protein family. DEAD box proteins, characterized by the conserved motif Asp-Glu-Ala-Asp (DEAD), are putative RNA helicases. They are implicated in a number of cellular processes involving alteration of RNA secondary structure, such as translation initiation, nuclear and mitochondrial splicing, and ribosome and spliceosome assembly. Based on their distribution patterns, some members of this family are believed to be involved in embryogenesis, spermatogenesis, and cellular growth and division. The protein encoded by this gene is a nuclear matrix protein. Its amino acid sequence is highly similar to the amino acid sequences of the translation initiation factors eIF4AI and eIF4AII, two other members of the DEAD box protein family. [provided by RefSeq, Jul 2008] | CT_TT | 4 | Gy_chrIX |
| 7 | FAM107B | 83641 | family with sequence similarity 107 member B | CT_TT | 4 | Gy_chrXVII | |
| 8 | FHDC1 | 85462 | FH2 domain containing 1 | CT_TT | 4 | Gy_chrIX | |
| 12 | HBA2 | 3040 | hemoglobin subunit alpha 2 | The human alpha globin gene cluster located on chromosome 16 spans about 30 kb and includes seven loci: 5’- zeta - pseudozeta - mu - pseudoalpha-1 - alpha-2 - alpha-1 - theta - 3’. The alpha-2 (HBA2) and alpha-1 (HBA1) coding sequences are identical. These genes differ slightly over the 5’ untranslated regions and the introns, but they differ significantly over the 3’ untranslated regions. Two alpha chains plus two beta chains constitute HbA, which in normal adult life comprises about 97% of the total hemoglobin; alpha chains combine with delta chains to constitute HbA-2, which with HbF (fetal hemoglobin) makes up the remaining 3% of adult hemoglobin. Alpha thalassemias result from deletions of each of the alpha genes as well as deletions of both HBA2 and HBA1; some nondeletion alpha thalassemias have also been reported. [provided by RefSeq, Jul 2008] | CT_TT | 4 | Gy_chrV |
| 13 | HOXB9A | NA | NA | NA | CT_TT | 4 | Gy_chrXI |
| 14 | LGALSL | 29094 | galectin like | CT_TT | 4 | Gy_chrIX | |
| 15 | LTC4S | 4056 | leukotriene C4 synthase | The MAPEG (Membrane Associated Proteins in Eicosanoid and Glutathione metabolism) family includes a number of human proteins, several of which are involved the production of leukotrienes. This gene encodes an enzyme that catalyzes the first step in the biosynthesis of cysteinyl leukotrienes, potent biological compounds derived from arachidonic acid. Leukotrienes have been implicated as mediators of anaphylaxis and inflammatory conditions such as human bronchial asthma. This protein localizes to the nuclear envelope and adjacent endoplasmic reticulum. [provided by RefSeq, Jul 2008] | CT_TT | 4 | Gy_chrIV |
| 16 | MAP1LC3C | 440738 | microtubule associated protein 1 light chain 3 gamma | Autophagy is a highly regulated bulk degradation process that plays an important role in cellular maintenance and development. MAP1LC3C is an ortholog of the yeast autophagosome protein Atg8 (He et al., 2003 [PubMed 12740394]).[supplied by OMIM, Nov 2010] | CT_TT | 4 | Gy_chrIV |
| 17 | MAPK13 | 5603 | mitogen-activated protein kinase 13 | This gene encodes a member of the mitogen-activated protein (MAP) kinase family. MAP kinases act as an integration point for multiple biochemical signals, and are involved in a wide variety of cellular processes such as proliferation, differentiation, transcription regulation and development. The encoded protein is a p38 MAP kinase and is activated by proinflammatory cytokines and cellular stress. Substrates of the encoded protein include the transcription factor ATF2 and the microtubule dynamics regulator stathmin. Alternatively spliced transcript variants have been observed for this gene. [provided by RefSeq, Jul 2012] | CT_TT | 4 | Gy_chrXII |
| 18 | MC4R | 4160 | melanocortin 4 receptor | The protein encoded by this gene is a membrane-bound receptor and member of the melanocortin receptor family. The encoded protein interacts with adrenocorticotropic and MSH hormones and is mediated by G proteins. This is an intronless gene. Defects in this gene are a cause of autosomal dominant obesity. [provided by RefSeq, Jan 2010] | CT_TT | 4 | Gy_chrXXI |
| 21 | NANP | 140838 | N-acetylneuraminic acid phosphatase | CT_TT | 4 | Gy_chrVI | |
| 22 | OPN5 | 221391 | opsin 5 | Opsins are members of the guanine nucleotide-binding protein (G protein)-coupled receptor superfamily. This opsin gene is expressed in the eye, brain, testes, and spinal cord. This gene belongs to the seven-exon subfamily of mammalian opsin genes that includes peropsin (RRH) and retinal G protein coupled receptor (RGR). Like these other seven-exon opsin genes, this family member may encode a protein with photoisomerase activity. Alternative splicing results in multiple transcript variants. [provided by RefSeq, Jun 2010] | CT_TT | 4 | Gy_chrXVIII |
| 23 | OR11A1 | 26531 | olfactory receptor family 11 subfamily A member 1 | Olfactory receptors interact with odorant molecules in the nose, to initiate a neuronal response that triggers the perception of a smell. The olfactory receptor proteins are members of a large family of G-protein-coupled receptors (GPCR) arising from single coding-exon genes. Olfactory receptors share a 7-transmembrane domain structure with many neurotransmitter and hormone receptors and are responsible for the recognition and G protein-mediated transduction of odorant signals. The olfactory receptor gene family is the largest in the genome. The nomenclature assigned to the olfactory receptor genes and proteins for this organism is independent of other organisms. [provided by RefSeq, Jul 2008] | CT_TT | 4 | Gy_chrI |
| 25 | RIBC1 | 158787 | RIB43A domain with coiled-coils 1 | CT_TT | 4 | Gy_chrXII | |
| 26 | RPL10A | 4736 | ribosomal protein L10a | Ribosomes, the organelles that catalyze protein synthesis, consist of a small 40S subunit and a large 60S subunit. Together these subunits are composed of 4 RNA species and approximately 80 structurally distinct proteins. This gene encodes a ribosomal protein that is a component of the 60S subunit. The protein belongs to the L1P family of ribosomal proteins. It is located in the cytoplasm. The expression of this gene is downregulated in the thymus by cyclosporin-A (CsA), an immunosuppressive drug. Studies in mice have shown that the expression of the ribosomal protein L10a gene is downregulated in neural precursor cells during development. This gene previously was referred to as NEDD6 (neural precursor cell expressed, developmentally downregulated 6), but it has been renamed RPL10A (ribosomal protein 10a). As is typical for genes encoding ribosomal proteins, there are multiple processed pseudogenes of this gene dispersed through the genome. [provided by RefSeq, Jul 2008] | CT_TT | 4 | Gy_chrXII |
| 27 | S100Z | 170591 | S100 calcium binding protein Z | Members of the S100 protein family contain 2 calcium-binding EF-hands and exhibit cell-type specific expression patterns. For additional background information on S100 proteins, see MIM 114085.[supplied by OMIM, Mar 2008] | CT_TT | 4 | Gy_chrXIV |
| 28 | ZNF518B | 85460 | zinc finger protein 518B | CT_TT | 4 | Gy_chrIV |
| GeneSymbol | ENTREZID | description | Summary | comparison | N_BP | seqid | |
|---|---|---|---|---|---|---|---|
| 1 | CDPF1 | 150383 | cysteine rich DPF motif domain containing 1 | CC_CT | 5 | Gy_chrIV | |
| 6 | MAP1LC3C | 440738 | microtubule associated protein 1 light chain 3 gamma | Autophagy is a highly regulated bulk degradation process that plays an important role in cellular maintenance and development. MAP1LC3C is an ortholog of the yeast autophagosome protein Atg8 (He et al., 2003 [PubMed 12740394]).[supplied by OMIM, Nov 2010] | CC_CT | 5 | Gy_chrIV |
| 7 | NSUN6 | 221078 | NOP2/Sun RNA methyltransferase 6 | CC_CT | 5 | Gy_chrXXI | |
| 9 | PLPP5 | 84513 | phospholipid phosphatase 5 | CC_CT | 5 | Gy_chrXIII | |
| 2 | CHGA | 1113 | chromogranin A | The protein encoded by this gene is a member of the chromogranin/secretogranin family of neuroendocrine secretory proteins. It is found in secretory vesicles of neurons and endocrine cells. This gene product is a precursor to three biologically active peptides; vasostatin, pancreastatin, and parastatin. These peptides act as autocrine or paracrine negative modulators of the neuroendocrine system. Two other peptides, catestatin and chromofungin, have antimicrobial activity and antifungal activity, respectively. Two transcript variants encoding different isoforms have been found for this gene. [provided by RefSeq, Sep 2014] | CC_CT | 4 | Gy_chrXVIII |
| 3 | CIAO2B | 51647 | cytosolic iron-sulfur assembly component 2B | CC_CT | 4 | Gy_chrVI | |
| 4 | EPN3 | 55040 | epsin 3 | CC_CT | 4 | Gy_chrXI | |
| 5 | HBA2 | 3040 | hemoglobin subunit alpha 2 | The human alpha globin gene cluster located on chromosome 16 spans about 30 kb and includes seven loci: 5’- zeta - pseudozeta - mu - pseudoalpha-1 - alpha-2 - alpha-1 - theta - 3’. The alpha-2 (HBA2) and alpha-1 (HBA1) coding sequences are identical. These genes differ slightly over the 5’ untranslated regions and the introns, but they differ significantly over the 3’ untranslated regions. Two alpha chains plus two beta chains constitute HbA, which in normal adult life comprises about 97% of the total hemoglobin; alpha chains combine with delta chains to constitute HbA-2, which with HbF (fetal hemoglobin) makes up the remaining 3% of adult hemoglobin. Alpha thalassemias result from deletions of each of the alpha genes as well as deletions of both HBA2 and HBA1; some nondeletion alpha thalassemias have also been reported. [provided by RefSeq, Jul 2008] | CC_CT | 4 | Gy_chrV |
| 8 | OR11A1 | 26531 | olfactory receptor family 11 subfamily A member 1 | Olfactory receptors interact with odorant molecules in the nose, to initiate a neuronal response that triggers the perception of a smell. The olfactory receptor proteins are members of a large family of G-protein-coupled receptors (GPCR) arising from single coding-exon genes. Olfactory receptors share a 7-transmembrane domain structure with many neurotransmitter and hormone receptors and are responsible for the recognition and G protein-mediated transduction of odorant signals. The olfactory receptor gene family is the largest in the genome. The nomenclature assigned to the olfactory receptor genes and proteins for this organism is independent of other organisms. [provided by RefSeq, Jul 2008] | CC_CT | 4 | Gy_chrI |
| 10 | TRIM16 | 10626 | tripartite motif containing 16 | The protein encoded by this gene is a tripartite motif (TRIM) family member that contains two B box domains and a coiled-coiled region that are characteristic of the B box zinc finger protein family. While it lacks a RING domain found in other TRIM proteins, the encoded protein can homodimerize or heterodimerize with other TRIM proteins and has E3 ubiquitin ligase activity. This gene is also a tumor suppressor and is involved in secretory autophagy. [provided by RefSeq, Jan 2017] | CC_CT | 4 | Gy_chrV |
| GeneSymbol | ENTREZID | description | Summary | comparison | N_BP | seqid |
|---|---|---|---|---|---|---|
| CDCP1 | 64866 | CUB domain containing protein 1 | This gene encodes a transmembrane protein which contains three extracellular CUB domains and acts as a substrate for Src family kinases. The protein plays a role in the tyrosine phosphorylation-dependent regulation of cellular events that are involved in tumor invasion and metastasis. Alternative splicing results in multiple transcript variants of this gene. [provided by RefSeq, May 2013] | TC_TT | 5 | Gy_chrXX |
| LIM2 | 3982 | lens intrinsic membrane protein 2 | This gene encodes an eye lens-specific protein found at the junctions of lens fiber cells, where it may contribute to cell junctional organization. It acts as a receptor for calmodulin, and may play an important role in both lens development and cataractogenesis. Mutations in this gene have been associated with cataract formation. Alternatively spliced transcript variants encoding different isoforms have been found for this gene.[provided by RefSeq, Sep 2009] | TC_TT | 4 | Gy_chrI |
| MAP1LC3C | 440738 | microtubule associated protein 1 light chain 3 gamma | Autophagy is a highly regulated bulk degradation process that plays an important role in cellular maintenance and development. MAP1LC3C is an ortholog of the yeast autophagosome protein Atg8 (He et al., 2003 [PubMed 12740394]).[supplied by OMIM, Nov 2010] | TC_TT | 4 | Gy_chrIV |
| MGAT2 | 4247 | alpha-1,6-mannosyl-glycoprotein 2-beta-N-acetylglucosaminyltransferase | The product of this gene is a Golgi enzyme catalyzing an essential step in the conversion of oligomannose to complex N-glycans. The enzyme has the typical glycosyltransferase domains: a short N-terminal cytoplasmic domain, a hydrophobic non-cleavable signal-anchor domain, and a C-terminal catalytic domain. Mutations in this gene may lead to carbohydrate-deficient glycoprotein syndrome, type II. The coding region of this gene is intronless. Transcript variants with a spliced 5’ UTR may exist, but their biological validity has not been determined. [provided by RefSeq, Jul 2008] | TC_TT | 4 | Gy_chrXVIII |
| RNF167 | 26001 | ring finger protein 167 | RNF167 is an E3 ubiquitin ligase that interacts with TSSC5 (SLC22A18; MIM 602631) and, together with UBCH6 (UBE2E1; MIM 602916), facilitates TSSC5 polyubiquitylation (Yamada and Gorbsky, 2006 [PubMed 16314844]).[supplied by OMIM, Mar 2008] | TC_TT | 4 | Gy_chrVII |
| RPL28 | 6158 | ribosomal protein L28 | Ribosomes, the organelles that catalyze protein synthesis, consist of a small 40S subunit and a large 60S subunit. Together these subunits are composed of 4 RNA species and approximately 80 structurally distinct proteins. This gene encodes a ribosomal protein that is a component of the 60S subunit. The protein belongs to the L28E family of ribosomal proteins. It is located in the cytoplasm. Variable expression of this gene in colorectal cancers compared to adjacent normal tissues has been observed, although no correlation between the level of expression and the severity of the disease has been found. As is typical for genes encoding ribosomal proteins, there are multiple processed pseudogenes of this gene dispersed through the genome. Alternative splicing results in multiple transcript variants encoding distinct isoforms.[provided by RefSeq, Oct 2008] | TC_TT | 4 | Gy_chrXIV |
| ZNF518B | 85460 | zinc finger protein 518B | TC_TT | 4 | Gy_chrIV | |
| ZNF691 | 51058 | zinc finger protein 691 | TC_TT | 4 | Gy_chrV |
Approach:
PCA
extract axes 1 & 2
Correlation with parasite load/BCI
# 1. get raw values
## Keep DMS positions detected for parental effect (vecCompa 1 & 2)
DMSvec_parentalEffect = unique(unlist(c(get2keep(vecCompa[1], NBP = 4), get2keep(vecCompa[2], NBP = 4)))) # at least in 4 brother pairs (half)
pos2keep = which(paste(uniteCov14_G2_woSexAndUnknowChrOVERLAP$chr, uniteCov14_G2_woSexAndUnknowChrOVERLAP$start, sep = " ") %in% DMSvec_parentalEffect)
uniteAtDMS = methylKit::select(uniteCov14_G2_woSexAndUnknowChrOVERLAP, pos2keep)
percAtDMS = percMethylation(uniteAtDMS)
# Run PCA on complete data (CpG covered in all fish)
PCA_percAtDMS_all <- myPCA(x = t(na.omit(percAtDMS)), incomplete = FALSE)
## [1] "The chosen model is:"
## BCI ~ PCA2 + No.Worms + PAT + (1 | brotherPairID) + (1 | Sex) +
## PCA2:PAT + No.Worms:PAT
## <environment: 0x55c00f886988>
We perform a PCA on 706 CpG sites covered in all G2 individuals. We first perform a test on the complete dataset.
fviz_pca_ind(PCA_percAtDMS_all$res.PCA, label="none", habillage=PCA_percAtDMS_all$metadata$trtG1G2) +
scale_color_manual(values = colOffs)+
scale_shape_manual(values=c(19,19,19,19))
fviz_pca_ind(PCA_percAtDMS_all$res.PCA, label="none", habillage=as.factor(PCA_percAtDMS_all$metadata$brotherPairID))
# The function dimdesc() can be used to identify the most correlated variables with a given principal component.
mydimdesc <- dimdesc(PCA_percAtDMS_all$res.PCA, axes = c(1,2), proba = 0.05)
# There are `r nrow(mydimdesc$Dim.1$quanti)` CpG sites most correlated (p < 0.05) with the first principal component , and `r nrow(mydimdesc$Dim.2$quanti)` with the second principal component.
nrow(mydimdesc$Dim.1$quanti)
## [1] 352
nrow(mydimdesc$Dim.2$quanti)
## [1] 346
# The second PCA axis is significant in BCI (`r formula(PCA_percAtDMS_all$modSel)`).
formula(PCA_percAtDMS_all$modSel)
## BCI ~ PCA2 + No.Worms + PAT + (1 | brotherPairID) + (1 | Sex) +
## PCA2:PAT + No.Worms:PAT
## <environment: 0x55ab89de8368>
# Percentage of variance explained by each factor:
mod_noworms = lmer(BCI ~ PCA2 + PAT + PCA2:PAT + (1 | brotherPairID) + (1 | Sex), data = PCA_percAtDMS_all$metadata)
mod_noPAT = lmer(BCI ~ PCA2 + No.Worms + (1 | brotherPairID) + (1 | Sex), data = PCA_percAtDMS_all$metadata)
mod_noPCA2 = lmer(BCI ~ No.Worms + PAT + No.Worms:PAT + (1 | brotherPairID) + (1 | Sex), data = PCA_percAtDMS_all$metadata)
# R2c conditional R2 value associated with fixed effects plus the random effects.
A = (MuMIn::r.squaredGLMM(PCA_percAtDMS_all$modSel)[2] -
MuMIn::r.squaredGLMM(mod_noworms)[2])*100
B = (MuMIn::r.squaredGLMM(PCA_percAtDMS_all$modSel)[2] -
MuMIn::r.squaredGLMM(mod_noPAT)[2])*100
C = (MuMIn::r.squaredGLMM(PCA_percAtDMS_all$modSel)[2] -
MuMIn::r.squaredGLMM(mod_noPCA2)[2])*100
We found 1963 DMS linked with parental effect (comparisons CC_TC and CT_TT)
We use missMDA and FactoMineR for imputation of missing data and performing of PCA: (see:
# 1. get raw values
# Run PCA on complete data (CpG covered in all fish)
PCA_percAtDMS_imputed <- myPCA(x = t(percAtDMS), incomplete = TRUE)
## [1] "The chosen model is:"
## BCI ~ PCA2 + No.Worms + PAT + (1 | brotherPairID) + (1 | Sex) +
## PCA2:PAT + No.Worms:PAT
## <environment: 0x55ab944cc3d8>
fviz_pca_ind(PCA_percAtDMS_imputed$res.PCA, label="none", habillage=as.factor(PCA_percAtDMS_imputed$metadata$PAT)) +
scale_shape_manual(values=c(19,19))
fviz_pca_ind(PCA_percAtDMS_imputed$res.PCA, label="none", habillage=as.factor(PCA_percAtDMS_imputed$metadata$brotherPairID))
# The function dimdesc() can be used to identify the most correlated variables with a given principal component.
mydimdesc <- dimdesc(PCA_percAtDMS_imputed$res.PCA, axes = c(1,2), proba = 0.05)
# There are `r nrow(mydimdesc$Dim.1$quanti)` CpG sites most correlated (p < 0.05) with the first principal component , and `r nrow(mydimdesc$Dim.2$quanti)` with the second principal component.
nrow(mydimdesc$Dim.1$quanti)
## [1] 1006
nrow(mydimdesc$Dim.2$quanti)
## [1] 991
# The second PCA axis is significant in BCI (`r formula(PCA_percAtDMS_imputed$modSel)`).
formula(PCA_percAtDMS_imputed$modSel)
## BCI ~ PCA2 + No.Worms + PAT + (1 | brotherPairID) + (1 | Sex) +
## PCA2:PAT + No.Worms:PAT
## <environment: 0x55fec3bfa4b0>
# Percentage of variance explained by each factor:
mod_noworms = lmer(BCI ~ PCA2 + PAT + PCA2:PAT + (1 | brotherPairID) + (1 | Sex), data = PCA_percAtDMS_imputed$metadata)
mod_noPAT = lmer(BCI ~ PCA2 + No.Worms + (1 | brotherPairID) + (1 | Sex), data = PCA_percAtDMS_imputed$metadata)
mod_noPCA2 = lmer(BCI ~ No.Worms + PAT + No.Worms:PAT + (1 | brotherPairID) + (1 | Sex), data = PCA_percAtDMS_imputed$metadata)
# R2c conditional R2 value associated with fixed effects plus the random effects.
A = (MuMIn::r.squaredGLMM(PCA_percAtDMS_imputed$modSel)[2] -
MuMIn::r.squaredGLMM(mod_noworms)[2])*100
B = (MuMIn::r.squaredGLMM(PCA_percAtDMS_imputed$modSel)[2] -
MuMIn::r.squaredGLMM(mod_noPAT)[2])*100
C = (MuMIn::r.squaredGLMM(PCA_percAtDMS_imputed$modSel)[2] -
MuMIn::r.squaredGLMM(mod_noPCA2)[2])*100
plot(ggpredict(PCA_percAtDMS_imputed$modSel, terms = c("PCA2", "No.Worms", "PAT")), add.data = TRUE, alpha = .08) + theme_bw() +
scale_color_gradient(name = "Number of worms", low = "blue", high = "red")+
scale_fill_gradient(low = "blue", high = "red") +
ylab("Body Condition Index") +
ggtitle("Predicted values of Body Condition Index in offspring")
tbc
## Features Annotation (use package genomation v1.24.0)
## NB Promoters are defined by options at genomation::readTranscriptFeatures function.
## The default option is to take -1000,+1000bp around the TSS and you can change that.
## -> following Heckwolf 2020 and Sagonas 2020, we consider 1500bp upstream and 500 bp downstream
## Parents comparison:
diffAnn_PAR = annotateWithGeneParts(as(DMSlist$DMS_15pc_G1_C_T,"GRanges"),annotBed12)
## Offspring from control parents comparison:
diffAnn_G2_controlG1 = annotateWithGeneParts(as(DMSlist$DMS_15pc_CC_CT,"GRanges"),annotBed12)
## Offspring from infected parents comparison:
diffAnn_G2_infectedG1 = annotateWithGeneParts(as(DMSlist$DMS_15pc_TC_TT,"GRanges"),annotBed12)
par(mfrow=c(1,3))
par(mar = c(.1,0.1,5,0.1)) # Set the margin on all sides to 2
## Parents comparison:
diffAnn_PAR = annotateWithGeneParts(as(DMSlist$DMS_15pc_G1_C_T,"GRanges"),annotBed12)
diffAnn_PAR
## promoter exon intron intergenic
## 8.58 15.84 34.13 45.72
## promoter exon intron intergenic
## 8.58 13.19 32.51 45.72
## promoter exon intron
## 1.07 0.19 0.44
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3 3020 8128 15141 19226 300247
genomation::plotTargetAnnotation(diffAnn_PAR,precedence=TRUE, main="DMS G1", cex.legend = 1, border="white")
## Offspring from control parents comparison:
diffAnn_G2_controlG1 = annotateWithGeneParts(as(DMSlist$DMS_15pc_CC_CT,"GRanges"),annotBed12)
diffAnn_G2_controlG1
## promoter exon intron intergenic
## 11.28 21.05 30.16 44.44
## promoter exon intron intergenic
## 11.28 16.21 28.07 44.44
## promoter exon intron
## 0.38 0.07 0.14
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 11 2602 6295 14212 15906 261017
genomation::plotTargetAnnotation(diffAnn_G2_controlG1,precedence=TRUE, main="DMS G2-G1c", cex.legend = 1, border="white")
## Offspring from infected parents comparison:
diffAnn_G2_infectedG1 = annotateWithGeneParts(as(DMSlist$DMS_15pc_TC_TT,"GRanges"),annotBed12)
diffAnn_G2_infectedG1
## promoter exon intron intergenic
## 12.90 13.62 34.64 46.81
## promoter exon intron intergenic
## 12.90 9.42 30.87 46.81
## promoter exon intron
## 0.28 0.03 0.08
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 9 2355 7150 12285 14773 109527
genomation::plotTargetAnnotation(diffAnn_G2_infectedG1,precedence=TRUE, main="DMS G2-G1i", cex.legend = 1, border="white")
par(mfrow=c(1,1))
## Run the function to get DMS info
DMS_info_G1 <- myDMSinfo(DMSlist$DMS_15pc_G1_C_T, uniteCov6_G1_woSexAndUnknowChrOVERLAP)
DMS_info_G2_G1c_final <- myDMSinfo(DMSlist$DMS_15pc_CC_CT, uniteCov14_G2_woSexAndUnknowChrOVERLAP)
DMS_info_G2_G1i_final <- myDMSinfo(DMSlist$DMS_15pc_TC_TT,uniteCov14_G2_woSexAndUnknowChrOVERLAP)
NB Kostas’ results: “We found a total of 1,973 CpG sites out of 1,172,887 CpGs (0.17%) across the genome that showed at least 15% differential fractional methylation (differentially methylated site [DMS]; q < 0.01) between infected and uninfected fish”
Here: we obtain out a total of 1001880 CpG sites: 3648 (0.36%) showed at least 15% differential fractional methylation (differentially methylated site [DMS]; q < 0.01) between infected and uninfected fish in the parental group; 1197 (0.12%) in the offspring from control parents comparison; 690 (0.07%) in the offspring from infected parents comparison.
## Chi2 test: are the number of DMS from G2-G1C and G2-G1I overlapping with DMSpar statistically different?
A=length(intersect(DMS_info_G1$DMS,DMS_info_G2_G1c_final$DMS))
B=length(DMS_info_G2_G1c_final$DMS)
C=length(intersect(DMS_info_G1$DMS,DMS_info_G2_G1i_final$DMS))
D=length(DMS_info_G2_G1i_final$DMS)
Observed=matrix(c(A, B-A, C, D-C),nrow=2)
Observed
## [,1] [,2]
## [1,] 106 59
## [2,] 1091 631
chisq.test(Observed)
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: Observed
## X-squared = 0.019909, df = 1, p-value = 0.8878
## not statistically different
## output Venn diagrams
allVenn <- ggVennDiagram(list("DMS G1" = DMS_info_G1$DMS, "DMS G2-c" = DMS_info_G2_G1c_final$DMS, "DMS G2-i" = DMS_info_G2_G1i_final$DMS), label_alpha = 0) +
scale_fill_gradient(low="white",high = "red")
hypoVenn <- ggVennDiagram(list("DMS G1\nhypo" = DMS_info_G1$DMS[DMS_info_G1$direction %in% "hypo"],
"DMS G2-c\nhypo" = DMS_info_G2_G1c_final$DMS[DMS_info_G2_G1c_final$direction %in% "hypo"],
"DMS G2-i\nhypo" = DMS_info_G2_G1i_final$DMS[DMS_info_G2_G1i_final$direction %in% "hypo"]), label_alpha = 0) +
scale_fill_gradient(low="white",high = "red")
hyperVenn <- ggVennDiagram(list("DMS G1\nhyper" = DMS_info_G1$DMS[DMS_info_G1$direction %in% "hyper"],
"DMS G2-c\nhyper" = DMS_info_G2_G1c_final$DMS[DMS_info_G2_G1c_final$direction %in% "hyper"],
"DMS G2-i\nhyper" = DMS_info_G2_G1i_final$DMS[DMS_info_G2_G1i_final$direction %in% "hyper"]), label_alpha = 0) +
scale_fill_gradient(low="white",high = "red")
ggarrange(allVenn,
ggarrange(hypoVenn, hyperVenn, ncol = 2, legend = "none"),
nrow = 2, widths = c(.5,1))
runHyperHypoAnnot <- function(){
par(mfrow=c(2,3))
par(mar = c(.1,0.1,5,0.1)) # Set the margin on all sides to 2
####### HYPO
## Parents comparison:
A = annotateWithGeneParts(
as(DMSlist$DMS_15pc_G1_C_T[DMS_info_G1$direction %in% "hypo",],"GRanges"),annotBed12)
genomation::plotTargetAnnotation(A,precedence=TRUE, main="DMS G1\nhypo",
cex.legend = .4, border="white")
## Offspring from control parents comparison:
B = annotateWithGeneParts(
as(DMSlist$DMS_15pc_CC_CT[DMS_info_G2_G1c_final$direction %in% "hypo",],"GRanges"),annotBed12)
genomation::plotTargetAnnotation(B,precedence=TRUE, main="DMS G2-G1c\nhypo",
cex.legend = .4, border="white")
## Offspring from infected parents comparison:
C = annotateWithGeneParts(
as(DMSlist$DMS_15pc_TC_TT[DMS_info_G2_G1i_final$direction %in% "hypo",],"GRanges"),annotBed12)
genomation::plotTargetAnnotation(C,precedence=TRUE, main="DMS G2-G1i\nhypo",
cex.legend = .4, border="white")
####### HYPER
## Parents comparison:
D = annotateWithGeneParts(
as(DMSlist$DMS_15pc_G1_C_T[DMS_info_G1$direction %in% "hyper",],"GRanges"),annotBed12)
genomation::plotTargetAnnotation(D,precedence=TRUE, main="DMS G1\nhyper",
cex.legend = .4, border="white")
## Offspring from control parents comparison:
E = annotateWithGeneParts(
as(DMSlist$DMS_15pc_CC_CT[DMS_info_G2_G1c_final$direction %in% "hyper",],"GRanges"),annotBed12)
genomation::plotTargetAnnotation(E,precedence=TRUE, main="DMS G2-G1c\nhyper",
cex.legend = .4, border="white")
## Offspring from infected parents comparison:
f = annotateWithGeneParts(
as(DMSlist$DMS_15pc_TC_TT[DMS_info_G2_G1i_final$direction %in% "hyper",],"GRanges"),annotBed12)
genomation::plotTargetAnnotation(f,precedence=TRUE, main="DMS G2-G1i\nhyper",
cex.legend = .4, border="white")
par(mfrow=c(1,1))
return(list(G1hypo=A, G2G1chypo=B, G2G1ihypo=C, G1hyper=D, G2G1chyper=E, G2G1ihyper=f))
}
myannot=runHyperHypoAnnot()
############################################################
## Venn diagram of overlapping features by their annotation:
table(rowSums(as.data.frame(myannot$G1hypo@members))) # NB: some positions are labelled with several features!
##
## 0 1 2 3
## 559 566 49 2
## as in MBE 2021: "giving precedence to the following order promoters, exons,
## introns, and intergenic regions when features overlapped"
myAnnotateDMS <- function(DMS, annot){
## sanity check
if (nrow(DMS) != nrow(annot)){"STOP error in arguments"}
DMS$pos <- paste(DMS$chr, DMS$start, DMS$end)
## NB as in MBE 2021: "giving precedence to the following order promoters, exons,
## introns, and intergenic regions when features overlapped"
DMS$feature <- NA
## 1. promoters
DMS$feature[which(annot$prom == 1)] = "promoter"
## 2. exons
DMS$feature[which(annot$exon == 1 & annot$prom ==0)] = "exon"
## 3. intron
DMS$feature[which(annot$intro == 1 & annot$exon == 0 & annot$prom ==0)] = "intron"
## 4. intergenic regions
DMS$feature[which(annot$intro == 0 & annot$exon == 0 & annot$prom ==0)] = "intergenic"
return(DMS)
}
DMSlist$DMS_15pc_G1_C_T = myAnnotateDMS(DMSlist$DMS_15pc_G1_C_T, as.data.frame(diffAnn_PAR@members))
DMSlist$DMS_15pc_G1_C_T_HYPO = myAnnotateDMS(DMSlist$DMS_15pc_G1_C_T[DMS_info_G1$direction %in% "hypo",],
as.data.frame(myannot$G1hypo@members))
DMSlist$DMS_15pc_G1_C_T_HYPER = myAnnotateDMS(DMSlist$DMS_15pc_G1_C_T[DMS_info_G1$direction %in% "hyper",],
as.data.frame(myannot$G1hyper@members))
DMSlist$DMS_15pc_CC_CT = myAnnotateDMS(DMSlist$DMS_15pc_CC_CT, as.data.frame(diffAnn_G2_controlG1@members))
DMSlist$DMS_15pc_CC_CT_HYPO = myAnnotateDMS(DMSlist$DMS_15pc_CC_CT[DMS_info_G2_G1c_final$direction %in% "hypo",],
as.data.frame(myannot$G2G1chypo@members))
DMSlist$DMS_15pc_CC_CT_HYPER = myAnnotateDMS(DMSlist$DMS_15pc_CC_CT[DMS_info_G2_G1c_final$direction %in% "hyper",],
as.data.frame(myannot$G2G1chyper@members))
DMSlist$DMS_15pc_TC_TT = myAnnotateDMS(DMSlist$DMS_15pc_TC_TT, as.data.frame(diffAnn_G2_infectedG1@members))
DMSlist$DMS_15pc_TC_TT_HYPO = myAnnotateDMS(DMSlist$DMS_15pc_TC_TT[DMS_info_G2_G1i_final$direction %in% "hypo",],
as.data.frame(myannot$G2G1ihypo@members))
DMSlist$DMS_15pc_TC_TT_HYPER = myAnnotateDMS(DMSlist$DMS_15pc_TC_TT[DMS_info_G2_G1i_final$direction %in% "hyper",],
as.data.frame(myannot$G2G1ihyper@members))
## Make Venn diagram for each feature
getFeatureDFHYPO <- function(myfeat){
a = DMSlist$DMS_15pc_G1_C_T_HYPO$pos[DMSlist$DMS_15pc_G1_C_T_HYPO$feature %in% myfeat]
b = DMSlist$DMS_15pc_CC_CT_HYPO$pos[DMSlist$DMS_15pc_CC_CT_HYPO$feature %in% myfeat]
c = DMSlist$DMS_15pc_TC_TT_HYPO$pos[DMSlist$DMS_15pc_TC_TT_HYPO$feature %in% myfeat]
return(list(a=a,b=b,c=c))
}
getFeatureDFHYPER <- function(myfeat){
a = DMSlist$DMS_15pc_G1_C_T_HYPER$pos[DMSlist$DMS_15pc_G1_C_T_HYPER$feature %in% myfeat]
b = DMSlist$DMS_15pc_CC_CT_HYPER$pos[DMSlist$DMS_15pc_CC_CT_HYPER$feature %in% myfeat]
c = DMSlist$DMS_15pc_TC_TT_HYPER$pos[DMSlist$DMS_15pc_TC_TT_HYPER$feature %in% myfeat]
return(list(a=a,b=b,c=c))
}
getVenn <- function(feat, direction){
if (direction == "hypo"){
ggVennDiagram(list(A = getFeatureDFHYPO(feat)[["a"]],
B = getFeatureDFHYPO(feat)[["b"]],
C = getFeatureDFHYPO(feat)[["c"]]), label_alpha = 0,
category.names = c(paste0("DMS G1\nhypo\n", feat), paste0("DMS G2-c\nhypo\n", feat), paste0("DMS G2-i\nhypo\n", feat))) +
scale_fill_gradient(low="white",high = "red")
} else if (direction == "hyper"){
ggVennDiagram(list(A = getFeatureDFHYPER(feat)[["a"]],
B = getFeatureDFHYPER(feat)[["b"]],
C = getFeatureDFHYPER(feat)[["c"]]), label_alpha = 0,
category.names = c(paste0("DMS G1\nhyper\n", feat), paste0("DMS G2-c\nhyper\n", feat), paste0("DMS G2-i\nhyper\n", feat))) +
scale_fill_gradient(low="white",high = "red")
}
}
ggarrange(getVenn("promoter", "hypo"), getVenn("exon", "hypo"),
getVenn("intron", "hypo"), getVenn("intergenic", "hypo"),
nrow = 2, ncol = 2)
ggarrange(getVenn("promoter", "hyper"), getVenn("exon", "hyper"),
getVenn("intron", "hyper"), getVenn("intergenic", "hyper"),
nrow = 2, ncol = 2)
## Parents trt-ctrl
# load annotation
annot_PAR <- as.data.frame(diffAnn_PAR@members)
makeManhattanPlots(DMSfile = DMSlist$DMS_15pc_G1_C_T, annotFile = annot_PAR, GYgynogff = GYgynogff,
mycols = c("red", "grey", "black", "green"), mytitle = "Manhattan plot of G1 DMS")
## G2-G1c trt-ctrl
# load annotation
annot_G2_G1c <- as.data.frame(diffAnn_G2_controlG1@members)
makeManhattanPlots(DMSfile = DMSlist$DMS_15pc_CC_CT, annotFile = annot_G2_G1c, GYgynogff = GYgynogff,
mycols = c("red", "grey", "black", "green"), mytitle = "Manhattan plot of G2-G1c DMS")
## G2-G1i trt-ctrl
# load annotation
annot_G2_G1i <- as.data.frame(diffAnn_G2_infectedG1@members)
makeManhattanPlots(DMSfile = DMSlist$DMS_15pc_TC_TT, annotFile = annot_G2_G1i, GYgynogff = GYgynogff,
mycols = c("red", "grey", "black", "green"), mytitle = "Manhattan plot of G2-G1i DMS")
## Outliers in Manhattan plot: 15% diff + 2SD
outliers_G1_final <- which(abs(DMSlist$DMS_15pc_G1_C_T$meth.diff) > 15 + 2*sd(abs(DMSlist$DMS_15pc_G1_C_T$meth.diff)))
outliers_annot_G1 <- as.data.frame(diffAnn_PAR@members)[outliers_G1_final,]
makeManhattanPlots(DMSfile = DMSlist$DMS_15pc_G1_C_T[outliers_G1_final, ],
annotFile = outliers_annot_G1, GYgynogff = GYgynogff,
mycols = c("red", "grey", "black", "green"), mytitle = "Manhattan plot of G1 DMS")
outliers_G2_G1c_final <- which(abs(DMSlist$DMS_15pc_CC_CT$meth.diff) > 15 + 2*sd(abs(DMSlist$DMS_15pc_CC_CT$meth.diff)))
outliers_annot_G2_G1c <- as.data.frame(diffAnn_G2_controlG1@members)[outliers_G2_G1c_final,]
makeManhattanPlots(DMSfile = DMSlist$DMS_15pc_CC_CT[outliers_G2_G1c_final, ],
annotFile = outliers_annot_G2_G1c, GYgynogff = GYgynogff,
mycols = c("red", "grey", "black", "green"), mytitle = "Manhattan plot of G2-G1c DMS")
outliers_G2_G1i_final <- which(abs(DMSlist$DMS_15pc_TC_TT$meth.diff) > 15 + 2*sd(abs(DMSlist$DMS_15pc_TC_TT$meth.diff)))
outliers_annot_G2_G1i <- as.data.frame(diffAnn_G2_infectedG1@members)[outliers_G2_G1i_final,]
makeManhattanPlots(DMSfile = DMSlist$DMS_15pc_TC_TT[outliers_G2_G1i_final, ],
annotFile = outliers_annot_G2_G1i, GYgynogff = GYgynogff,
mycols = c("red", "grey", "black", "green"), mytitle = "Manhattan plot of G2-G1i DMS")
Question: how are the beta values in the different groups at the parental DMS?
##############
## Prepare dataset
PM_G1 <- getPMdataset(uniteCov = uniteCov6_G1_woSexAndUnknowChrOVERLAP, MD = fullMetadata_PAR, gener="parents")
PM_G2 <- getPMdataset(uniteCov = uniteCov14_G2_woSexAndUnknowChrOVERLAP, MD = fullMetadata_OFFS, gener="offspring")
Test of VCA: variance component analysis https://cran.r-project.org/web/packages/VCA/vignettes/VCA_package_vignette.html
PM_G2_mean_hypo <- PM_G2[PM_G2$hypohyper %in% "hypo", ] %>%
group_by(brotherPairID, G1_trt, G2_trt, ID) %>%
dplyr::summarize(MeanBetaValue = mean(BetaValue, na.rm=TRUE)) %>% data.frame()
varPlot(form = MeanBetaValue~(G1_trt* G2_trt*brotherPairID), Data = PM_G2_mean_hypo,
MeanLine=list(var=c("G1_trt", "G2_trt"),
col=c("white", "blue"), lwd=c(2,2)),
BG=list(var="G2_trt", col=paste0("gray", c(80, 90))),
YLabel=list(cex = .8, text="Mean beta value at parDMS \n hypomethylated upon infection"))
myfitVCA_hypo <- fitVCA(form = MeanBetaValue~(G1_trt* G2_trt*brotherPairID), Data = PM_G2_mean_hypo)
### Real values
trtEffect <- sum(myfitVCA_hypo$aov.tab[2:4, 5])
genEffect <- sum(myfitVCA_hypo$aov.tab[5:8, 5])
error <- sum(myfitVCA_hypo$aov.tab[9, 5])
realValHypoVCA <- data.frame(trtEffect=trtEffect, genEffect=genEffect,error=error)
### Randomisation
myrandomVCA <- function(df=PM_G2_mean_hypo){
randomDF = df
randomDF$G1_trt = sample(PM_G2_mean_hypo$G1_trt, replace = F)
randomDF$G2_trt = sample(PM_G2_mean_hypo$G2_trt, replace = F)
randomDF$brotherPairID = sample(PM_G2_mean_hypo$brotherPairID, replace = F)
myfitVCA <- fitVCA(form = MeanBetaValue~(G1_trt* G2_trt*brotherPairID), Data = randomDF)
trtEffect <- sum(myfitVCA$aov.tab[2:4, 5])
genEffect <- sum(myfitVCA$aov.tab[5:8, 5])
error <- sum(myfitVCA$aov.tab[9, 5])
return(data.frame(trtEffect=trtEffect, genEffect=genEffect,error=error))
}
# randomHypoVCA = do.call(rbind, lapply(1:1000, function(x) {
# df=myrandomVCA(PM_G2_mean_hypo)
# df$rep=x
# return(df)}))
# randomHypoVCA = melt(randomHypoVCA, id.vars = "rep")
# saveRDS(randomHypoVCA, file = "Rdata/randomHypoVCA.RDS")
randomHypoVCA <- readRDS(file = "Rdata/randomHypoVCA.RDS")
df2=reshape2::melt(realValHypoVCA)
sumDF <- randomHypoVCA %>%
group_by(variable) %>%
dplyr::summarize(value = mean(value)) %>% data.frame()
ggplot(randomHypoVCA, aes(x=variable, y=value))+
geom_boxplot()+
geom_jitter(width=.1, alpha=.2)+
geom_point(data = df2, col = "red", size = 6)+
geom_text(data=sumDF, aes(label=round(value)), col="white")+
geom_text(data = df2, aes(label=round(value)), col="white")+
theme_cleveland()+
ggtitle("VCA with bootstrap N=1000 at hypo-parDMS", subtitle = "red: observed values")
# estimate 95% confidence intervals, request CI for
# all variance components via 'VarVC=TRUE'
VCAinference(myfitVCA_hypo, VarVC=TRUE)
##
##
##
## Inference from (V)ariance (C)omponent (A)nalysis
## ------------------------------------------------
##
## > VCA Result:
## -------------
##
## Name DF SS MS VC %Total SD
## 1 total 6.6733 15.8572 100 3.9821
## 2 G1_trt 1 319.6035 319.6035 4.7515 29.9646 2.1798
## 3 G2_trt 1 3.8987 3.8987 0.0712 0.4491 0.2669
## 4 G1_trt:G2_trt 1 1.5539 1.5539 0* 0* 0*
## 5 brotherPairID 7 129.185 18.455 0* 0* 0*
## 6 G1_trt:brotherPairID 7 395.9735 56.5676 7.8576 49.5525 2.8031
## 7 G2_trt:brotherPairID 7 9.9214 1.4173 0* 0* 0*
## 8 G1_trt:G2_trt:brotherPairID 7 20.2165 2.8881 0* 0* 0*
## 9 error 79 250.9663 3.1768 3.1768 20.0337 1.7824
## CV[%] Var(VC)
## 1 6.6047
## 2 3.6154 66.6443
## 3 0.4426 0.0124
## 4 0* 0.0096
## 5 0* 5.4751
## 6 4.6493 19.7869
## 7 0* 0.068
## 8 0* 0.2383
## 9 2.9562 0.2555
##
## Mean: 60.2922 (N = 111)
##
## Experimental Design: unbalanced | Method: ANOVA | * VC set to 0 | adapted MS used for total DF
##
##
## > VC:
## -----
## Estimate CI LCL CI UCL One-Sided LCL One-Sided UCL
## total 15.8572 6.824 68.824 7.787 53.1883
## G1_trt 4.7515 0* 20.7519 0* 18.1795
## G2_trt 0.0712 0* 0.2897 0* 0.2545
## G1_trt:G2_trt 0
## brotherPairID 0
## G1_trt:brotherPairID 7.8576 0* 16.576 0.5409 15.1744
## G2_trt:brotherPairID 0
## G1_trt:G2_trt:brotherPairID 0
## error 3.1768 2.3794 4.457 2.491 4.2163
##
## > SD:
## -----
## Estimate CI LCL CI UCL One-Sided LCL One-Sided UCL
## total 3.9821 2.6123 8.296 2.7905 7.293
## G1_trt 2.1798 0* 4.5554 0* 4.2637
## G2_trt 0.2669 0* 0.5382 0* 0.5045
## G1_trt:G2_trt 0
## brotherPairID 0
## G1_trt:brotherPairID 2.8031 0* 4.0714 0.7355 3.8954
## G2_trt:brotherPairID 0
## G1_trt:G2_trt:brotherPairID 0
## error 1.7824 1.5425 2.1112 1.5783 2.0534
##
## > CV[%]:
## --------
## Estimate CI LCL CI UCL One-Sided LCL One-Sided UCL
## total 6.6047 4.3327 13.7597 4.6283 12.0961
## G1_trt 3.6154 0* 7.5556 0* 7.0718
## G2_trt 0.4426 0* 0.8926 0* 0.8368
## G1_trt:G2_trt 0
## brotherPairID 0
## G1_trt:brotherPairID 4.6493 0* 6.7527 1.2199 6.4609
## G2_trt:brotherPairID 0
## G1_trt:G2_trt:brotherPairID 0
## error 2.9562 2.5584 3.5015 2.6177 3.4057
##
##
## 95% Confidence Level | CIs for negative VCs excluded | * CI-limits constrained to be >= 0
## SAS PROC MIXED method used for computing CIs
PM_G2_mean_hyper <- PM_G2[PM_G2$hypohyper %in% "hyper", ] %>%
group_by(brotherPairID, G1_trt, G2_trt, ID) %>%
dplyr::summarize(MeanBetaValue = mean(BetaValue, na.rm=TRUE)) %>% data.frame()
varPlot(form = MeanBetaValue~(G1_trt* G2_trt*brotherPairID), Data = PM_G2_mean_hyper,
MeanLine=list(var=c("G1_trt", "G2_trt"),
col=c("white", "blue"), lwd=c(2,2)),
BG=list(var="G2_trt", col=paste0("gray", c(80, 90))),
YLabel=list(cex = .8, text="Mean beta value at parDMS \n hypermethylated upon infection"))
myfitVCA_hyper <- fitVCA(form = MeanBetaValue~(G1_trt* G2_trt*brotherPairID), Data = PM_G2_mean_hyper)
### Real values
trtEffect <- sum(myfitVCA_hyper$aov.tab[2:4, 5])
genEffect <- sum(myfitVCA_hyper$aov.tab[5:8, 5])
error <- sum(myfitVCA_hyper$aov.tab[9, 5])
realValHyperVCA <- data.frame(trtEffect=trtEffect, genEffect=genEffect,error=error)
### Randomisation
# randomHyperVCA = do.call(rbind, lapply(1:1000, function(x) {
# df=myrandomVCA(PM_G2_mean_hyper)
# df$rep=x
# return(df)}))
#
# randomHyperVCA = melt(randomHyperVCA, id.vars = "rep")
# saveRDS(randomHyperVCA, file = "Rdata/randomHyperVCA.RDS")
randomHyperVCA <- readRDS(file = "Rdata/randomHyperVCA.RDS")
df2=reshape2::melt(realValHyperVCA)
sumDF <- randomHyperVCA %>%
group_by(variable) %>%
dplyr::summarize(value = mean(value)) %>% data.frame()
ggplot(randomHyperVCA, aes(x=variable, y=value))+
geom_boxplot()+
geom_jitter(width=.1, alpha=.2)+
geom_point(data = df2, col = "red", size = 6)+
geom_text(data=sumDF, aes(label=round(value)), col="white")+
geom_text(data = df2, aes(label=round(value)), col="white")+
theme_cleveland()+
ggtitle("VCA with bootstrap N=1000 at hyper-parDMS", subtitle = "red: observed values")
# estimate 95% confidence intervals, request CI for
# all variance components via 'VarVC=TRUE'
VCAinference(myfitVCA_hypo, VarVC=TRUE)
##
##
##
## Inference from (V)ariance (C)omponent (A)nalysis
## ------------------------------------------------
##
## > VCA Result:
## -------------
##
## Name DF SS MS VC %Total SD
## 1 total 6.6733 15.8572 100 3.9821
## 2 G1_trt 1 319.6035 319.6035 4.7515 29.9646 2.1798
## 3 G2_trt 1 3.8987 3.8987 0.0712 0.4491 0.2669
## 4 G1_trt:G2_trt 1 1.5539 1.5539 0* 0* 0*
## 5 brotherPairID 7 129.185 18.455 0* 0* 0*
## 6 G1_trt:brotherPairID 7 395.9735 56.5676 7.8576 49.5525 2.8031
## 7 G2_trt:brotherPairID 7 9.9214 1.4173 0* 0* 0*
## 8 G1_trt:G2_trt:brotherPairID 7 20.2165 2.8881 0* 0* 0*
## 9 error 79 250.9663 3.1768 3.1768 20.0337 1.7824
## CV[%] Var(VC)
## 1 6.6047
## 2 3.6154 66.6443
## 3 0.4426 0.0124
## 4 0* 0.0096
## 5 0* 5.4751
## 6 4.6493 19.7869
## 7 0* 0.068
## 8 0* 0.2383
## 9 2.9562 0.2555
##
## Mean: 60.2922 (N = 111)
##
## Experimental Design: unbalanced | Method: ANOVA | * VC set to 0 | adapted MS used for total DF
##
##
## > VC:
## -----
## Estimate CI LCL CI UCL One-Sided LCL One-Sided UCL
## total 15.8572 6.824 68.824 7.787 53.1883
## G1_trt 4.7515 0* 20.7519 0* 18.1795
## G2_trt 0.0712 0* 0.2897 0* 0.2545
## G1_trt:G2_trt 0
## brotherPairID 0
## G1_trt:brotherPairID 7.8576 0* 16.576 0.5409 15.1744
## G2_trt:brotherPairID 0
## G1_trt:G2_trt:brotherPairID 0
## error 3.1768 2.3794 4.457 2.491 4.2163
##
## > SD:
## -----
## Estimate CI LCL CI UCL One-Sided LCL One-Sided UCL
## total 3.9821 2.6123 8.296 2.7905 7.293
## G1_trt 2.1798 0* 4.5554 0* 4.2637
## G2_trt 0.2669 0* 0.5382 0* 0.5045
## G1_trt:G2_trt 0
## brotherPairID 0
## G1_trt:brotherPairID 2.8031 0* 4.0714 0.7355 3.8954
## G2_trt:brotherPairID 0
## G1_trt:G2_trt:brotherPairID 0
## error 1.7824 1.5425 2.1112 1.5783 2.0534
##
## > CV[%]:
## --------
## Estimate CI LCL CI UCL One-Sided LCL One-Sided UCL
## total 6.6047 4.3327 13.7597 4.6283 12.0961
## G1_trt 3.6154 0* 7.5556 0* 7.0718
## G2_trt 0.4426 0* 0.8926 0* 0.8368
## G1_trt:G2_trt 0
## brotherPairID 0
## G1_trt:brotherPairID 4.6493 0* 6.7527 1.2199 6.4609
## G2_trt:brotherPairID 0
## G1_trt:G2_trt:brotherPairID 0
## error 2.9562 2.5584 3.5015 2.6177 3.4057
##
##
## 95% Confidence Level | CIs for negative VCs excluded | * CI-limits constrained to be >= 0
## SAS PROC MIXED method used for computing CIs
parmod <- lmer(data = PM_G1, BetaValue ~ meth.diff.parentals : Treatment + (1|CpGSite) + (1|brotherPairID))
## check normality of residuals assumption
qqnorm(resid(parmod))
qqline(resid(parmod))
pred <- ggpredict(parmod, terms = c("meth.diff.parentals", "Treatment"))
plot(pred, add.data = T)+
scale_color_manual(values = c("black", "red"))+
scale_y_continuous(name = "Beta values")+
scale_x_continuous(name = "Methylation difference between infected and control parents in percentage")+
ggtitle("Predicted methylation ratio (Beta) values in parents\n as a function of differential methylation between exposed and control groups")+
theme_bw()
modFull <- lmer(BetaValue ~ (G1_trt * G2_trt):hypohyper + (1|CpGSite) + (1|Sex) + (1|brotherPairID),data = PM_G2, REML = F) # REML =F for model comparison
mod_noG1trt <- lmer(BetaValue ~ G2_trt:hypohyper + (1|CpGSite)+ (1|Sex) + (1|brotherPairID), data = PM_G2, REML = F)
mod_noG2trt <-lmer(BetaValue ~ G1_trt:hypohyper + (1|CpGSite) + (1|Sex) + (1|brotherPairID), data = PM_G2, REML = F)
mod_noInteractions <- lmer(BetaValue ~ (G1_trt + G2_trt):hypohyper + (1|CpGSite) + (1|Sex) + (1|brotherPairID), data = PM_G2, REML = F)
mod_noHypoHyper <- lmer(BetaValue ~ (G1_trt * G2_trt) + (1|CpGSite) + (1|Sex) + (1|brotherPairID), data = PM_G2, REML = F)
## check normality of residuals assumption
qqnorm(resid(modFull))
qqline(resid(modFull))
## Likelihood ratio tests for all variables:
lrtest(modFull, mod_noG1trt) # G1 trt is VERY VERY significant (LRT: χ² (4) = 1163.6, p < 0.001)
## Likelihood ratio test
##
## Model 1: BetaValue ~ (G1_trt * G2_trt):hypohyper + (1 | CpGSite) + (1 |
## Sex) + (1 | brotherPairID)
## Model 2: BetaValue ~ G2_trt:hypohyper + (1 | CpGSite) + (1 | Sex) + (1 |
## brotherPairID)
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 12 -1515719
## 2 8 -1516301 -4 1163.6 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(modFull, mod_noG2trt) # G2 trt is VERY VERY significant (LRT: χ² (4) = 30.02, p < 0.001) NB that changed when brotherpair is used instead of family!
## Likelihood ratio test
##
## Model 1: BetaValue ~ (G1_trt * G2_trt):hypohyper + (1 | CpGSite) + (1 |
## Sex) + (1 | brotherPairID)
## Model 2: BetaValue ~ G1_trt:hypohyper + (1 | CpGSite) + (1 | Sex) + (1 |
## brotherPairID)
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 12 -1515719
## 2 8 -1515734 -4 30.022 4.844e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(modFull, mod_noInteractions) # interactions are significant (LRT: χ² (2) = 9.21, p < 0.01)
## Likelihood ratio test
##
## Model 1: BetaValue ~ (G1_trt * G2_trt):hypohyper + (1 | CpGSite) + (1 |
## Sex) + (1 | brotherPairID)
## Model 2: BetaValue ~ (G1_trt + G2_trt):hypohyper + (1 | CpGSite) + (1 |
## Sex) + (1 | brotherPairID)
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 12 -1515719
## 2 10 -1515724 -2 9.211 0.009997 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(modFull, mod_noHypoHyper) # hypo/hyper VERY VERY significant (LRT: χ² (4) = 1140, p < 0.001)
## Likelihood ratio test
##
## Model 1: BetaValue ~ (G1_trt * G2_trt):hypohyper + (1 | CpGSite) + (1 |
## Sex) + (1 | brotherPairID)
## Model 2: BetaValue ~ (G1_trt * G2_trt) + (1 | CpGSite) + (1 | Sex) + (1 |
## brotherPairID)
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 12 -1515719
## 2 8 -1516289 -4 1140.4 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Post-hoc tests between treatments
modFull <- lmer(BetaValue ~ (G1_trt * G2_trt):hypohyper + (1|CpGSite) + (1|Sex) + (1|brotherPairID),data = PM_G2)
modFull_emmeans <- emmeans(modFull, list(pairwise ~ (G1_trt:G2_trt):hypohyper), adjust = "tukey")
modFull_emmeans
## $`emmeans of G1_trt, G2_trt, hypohyper`
## G1_trt G2_trt hypohyper emmean SE df asymp.LCL asymp.UCL
## Control Control hypo 62.1 0.960 Inf 60.2 64.0
## Exposed Control hypo 58.8 0.959 Inf 57.0 60.7
## Control Exposed hypo 62.3 0.960 Inf 60.4 64.2
## Exposed Exposed hypo 58.9 0.960 Inf 57.1 60.8
## Control Control hyper 56.6 0.872 Inf 54.9 58.3
## Exposed Control hyper 58.7 0.872 Inf 57.0 60.4
## Control Exposed hyper 55.8 0.872 Inf 54.1 57.6
## Exposed Exposed hyper 58.6 0.872 Inf 56.9 60.3
##
## Degrees-of-freedom method: asymptotic
## Confidence level used: 0.95
##
## $`pairwise differences of G1_trt, G2_trt, hypohyper`
## contrast estimate SE df z.ratio
## Control Control hypo - Exposed Control hypo 3.2670 0.199 Inf 16.439
## Control Control hypo - Control Exposed hypo -0.2024 0.205 Inf -0.989
## Control Control hypo - Exposed Exposed hypo 3.1756 0.202 Inf 15.735
## Control Control hypo - Control Control hyper 5.5292 0.672 Inf 8.226
## Control Control hypo - Exposed Control hyper 3.3925 0.672 Inf 5.049
## Control Control hypo - Control Exposed hyper 6.2691 0.673 Inf 9.317
## Control Control hypo - Exposed Exposed hyper 3.5420 0.672 Inf 5.268
## Exposed Control hypo - Control Exposed hypo -3.4694 0.202 Inf -17.160
## Exposed Control hypo - Exposed Exposed hypo -0.0913 0.199 Inf -0.460
## Exposed Control hypo - Control Control hyper 2.2623 0.671 Inf 3.369
## Exposed Control hypo - Exposed Control hyper 0.1256 0.671 Inf 0.187
## Exposed Control hypo - Control Exposed hyper 3.0021 0.672 Inf 4.467
## Exposed Control hypo - Exposed Exposed hyper 0.2751 0.671 Inf 0.410
## Control Exposed hypo - Exposed Exposed hypo 3.3780 0.205 Inf 16.479
## Control Exposed hypo - Control Control hyper 5.7317 0.673 Inf 8.514
## Control Exposed hypo - Exposed Control hyper 3.5950 0.673 Inf 5.342
## Control Exposed hypo - Control Exposed hyper 6.4715 0.674 Inf 9.608
## Control Exposed hypo - Exposed Exposed hyper 3.7444 0.673 Inf 5.561
## Exposed Exposed hypo - Control Control hyper 2.3536 0.672 Inf 3.501
## Exposed Exposed hypo - Exposed Control hyper 0.2169 0.672 Inf 0.323
## Exposed Exposed hypo - Control Exposed hyper 3.0935 0.673 Inf 4.597
## Exposed Exposed hypo - Exposed Exposed hyper 0.3664 0.672 Inf 0.545
## Control Control hyper - Exposed Control hyper -2.1367 0.137 Inf -15.618
## Control Control hyper - Control Exposed hyper 0.7398 0.141 Inf 5.251
## Control Control hyper - Exposed Exposed hyper -1.9872 0.139 Inf -14.332
## Exposed Control hyper - Control Exposed hyper 2.8765 0.140 Inf 20.557
## Exposed Control hyper - Exposed Exposed hyper 0.1495 0.137 Inf 1.091
## Control Exposed hyper - Exposed Exposed hyper -2.7271 0.141 Inf -19.290
## p.value
## <.0001
## 0.9761
## <.0001
## <.0001
## <.0001
## <.0001
## <.0001
## <.0001
## 0.9998
## 0.0172
## 1.0000
## 0.0002
## 0.9999
## <.0001
## <.0001
## <.0001
## <.0001
## <.0001
## 0.0110
## 1.0000
## 0.0001
## 0.9994
## <.0001
## <.0001
## <.0001
## <.0001
## 0.9589
## <.0001
##
## Degrees-of-freedom method: asymptotic
## P value adjustment: tukey method for comparing a family of 8 estimates
P1 <- plot(modFull_emmeans, by = "hypohyper", comparisons = TRUE) +
# coord_flip()+
theme_bw() +
ggtitle("Estimated marginal means of methylation ratio (beta)\n of offspring at parental DMS")+
theme(legend.position = "none", axis.title.x = element_blank()) +
scale_x_continuous("Beta value (methylation ratio)", limits = c(47,69.5))
## NB: Comparison arrows: https://cran.r-project.org/web/packages/emmeans/vignettes/xplanations.html
## two estimated marginal means (EMMs) differ significantly if, and only if, their respective comparison arrows do not overlap
## These comparison arrows are decidedly not the same as confidence intervals.
## Confidence intervals for EMMs are based on the statistical properties of the individual EMMs, whereas comparison arrows
## are based on the statistical properties of differences of EMMs.
## Add the PARENTAL DMS value
## Same test on ALL, G1 and G2 fish
modFullG1 <- lmer(BetaValue ~ G1_trt:hypohyper + (1|CpGSite) + (1|brotherPairID), data = PM_G1)
modFullG1_emmeans <- emmeans(modFullG1, list(pairwise ~ G1_trt:hypohyper), adjust = "tukey")
modFullG1_emmeans
## $`emmeans of G1_trt, hypohyper`
## G1_trt hypohyper emmean SE df asymp.LCL asymp.UCL
## Control hypo 66.9 0.657 Inf 65.7 68.2
## Exposed hypo 49.1 0.661 Inf 47.8 50.4
## Control hyper 48.6 0.534 Inf 47.6 49.6
## Exposed hyper 65.8 0.536 Inf 64.7 66.8
##
## Degrees-of-freedom method: asymptotic
## Confidence level used: 0.95
##
## $`pairwise differences of G1_trt, hypohyper`
## 1 estimate SE df z.ratio p.value
## Control hypo - Exposed hypo 17.889 0.303 Inf 59.074 <.0001
## Control hypo - Control hyper 18.343 0.643 Inf 28.539 <.0001
## Control hypo - Exposed hyper 1.179 0.645 Inf 1.829 0.2597
## Exposed hypo - Control hyper 0.454 0.647 Inf 0.701 0.8966
## Exposed hypo - Exposed hyper -16.711 0.649 Inf -25.761 <.0001
## Control hyper - Exposed hyper -17.164 0.209 Inf -82.107 <.0001
##
## Degrees-of-freedom method: asymptotic
## P value adjustment: tukey method for comparing a family of 4 estimates
P2 <- plot(modFullG1_emmeans, by = "hypohyper", comparisons = TRUE) +
theme_bw() +
ggtitle("Estimated marginal means of methylation ratio (beta)\n of parents at DMS")+
theme(legend.position = "none", axis.title.x = element_blank()) +
scale_x_continuous("Beta value (methylation ratio)", limits = c(47,69.5))
ggarrange(P2, P1, labels = c("A", "B"), ncol = 1, nrow = 2)
length(unique(PM_G1$CpGSite))# 3648 positions
## [1] 3648
PM_G1 %>% dplyr::count(ID)## NB: not all covered in all samples
## ID n
## 1 S16 3300
## 2 S17 2842
## 3 S35 2747
## 4 S36 2568
## 5 S52 3037
## 6 S53 3468
## 7 S68 3348
## 8 S69 3500
## 9 S70 2237
## 10 S71 3302
## 11 S72 3479
## 12 S73 2973
## 13 S76 3239
## 14 S77 3232
## 15 S78 3434
## 16 S79 2679
## 17 S94 2564
## 18 S95 1766
## 19 S107 3414
## 20 S108 3230
## 21 S124 3387
## 22 S125 2526
## 23 S143 3097
## 24 S144 2773
length(unique(PM_G2$CpGSite[PM_G2$hypohyper %in% "hypo"]))# 1176 positions hypomethylated upon parental inf
## [1] 1176
length(unique(PM_G2$CpGSite[PM_G2$hypohyper %in% "hyper"]))# 2472 positions hypermethylated upon parental inf
## [1] 2472
myfun <- function(X){
## Calculate nbr of CpG hypo/hypermethylated per individual, and nbr of covered CpG:
X <- X %>% group_by(ID, Treatment, brotherPairID, clutch.ID, Sex) %>%
dplyr::summarise(ncov = n(),
hypoMeth = sum(BetaValue < 0.3),
hyperMeth = sum(BetaValue > 0.7)) %>% data.frame()
# Calculate residuals of nbr of methCpG by nbr of covered CpG
X$res_Nbr_methCpG_Nbr_coveredCpG_HYPO = residuals(lm(X$hypoMeth ~ X$ncov))
X$res_Nbr_methCpG_Nbr_coveredCpG_HYPER = residuals(lm(X$hyperMeth ~ X$ncov))
## Statistical model: is it different in each offspring trt group?
mod1 <- lmer(res_Nbr_methCpG_Nbr_coveredCpG_HYPO ~ Treatment + (1|brotherPairID/clutch.ID) + (1|Sex),
data = X, REML = F)
mod0 <- lmer(res_Nbr_methCpG_Nbr_coveredCpG_HYPO ~ 1 + (1|brotherPairID/clutch.ID) + (1|Sex),
data = X, REML = F)
print(lrtest(mod1, mod0))
## Post-hoc test:
modhypo <- lmer(res_Nbr_methCpG_Nbr_coveredCpG_HYPO ~ Treatment + (1|brotherPairID/clutch.ID) + (1|Sex),
data = X)
## pairwise posthoc test
print(emmeans(modhypo, list(pairwise ~ Treatment), adjust = "tukey"))
mod3 <- lmer(res_Nbr_methCpG_Nbr_coveredCpG_HYPER ~ Treatment + (1|brotherPairID/clutch.ID) + (1|Sex),
data = X, REML = F)
mod4 <- lmer(res_Nbr_methCpG_Nbr_coveredCpG_HYPER ~ 1 + (1|brotherPairID/clutch.ID) + (1|Sex),
data = X, REML = F)
print(lrtest(mod3, mod4))
## Post-hoc test:
modhyper <- lmer(res_Nbr_methCpG_Nbr_coveredCpG_HYPER ~ Treatment + (1|brotherPairID/clutch.ID) + (1|Sex),
data = X)
## pairwise posthoc test
print(emmeans(modhyper, list(pairwise ~ Treatment), adjust = "tukey"))
## Plot
phypo <- plot(ggpredict(modhypo, terms = c("Treatment")), add.data = TRUE)+
scale_y_continuous("Residuals of number of hypomethylated methylated \ncytosines on number of cytosines covered") +
ggtitle("Predicted residuals nbr of hypomethylated CpG")+
theme_bw()
phyper <- plot(ggpredict(modhyper, terms = c("Treatment")), add.data = TRUE)+
scale_y_continuous("Residuals of number of hypermethylated methylated \n cytosines on number of cytosines covered") +
ggtitle("Predicted residuals nbr of hypermethylated CpG")+
theme_bw()
return(list(phypo, phyper))
}
listplots <- myfun(X = PM_G2[PM_G2$hypohyper %in% "hypo",])
## Likelihood ratio test
##
## Model 1: res_Nbr_methCpG_Nbr_coveredCpG_HYPO ~ Treatment + (1 | brotherPairID/clutch.ID) +
## (1 | Sex)
## Model 2: res_Nbr_methCpG_Nbr_coveredCpG_HYPO ~ 1 + (1 | brotherPairID/clutch.ID) +
## (1 | Sex)
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 8 -420.75
## 2 5 -423.55 -3 5.605 0.1325
## $`emmeans of Treatment`
## Treatment emmean SE df lower.CL upper.CL
## NE_control -4.01 4.11 16.2 -12.72 4.70
## NE_exposed -5.64 4.14 16.3 -14.40 3.11
## E_control 7.45 4.12 16.1 -1.29 16.18
## E_exposed 4.61 4.12 16.1 -4.11 13.33
##
## Degrees-of-freedom method: kenward-roger
## Confidence level used: 0.95
##
## $`pairwise differences of Treatment`
## 1 estimate SE df t.ratio p.value
## NE_control - NE_exposed 1.63 2.55 93.08 0.640 0.9188
## NE_control - E_control -11.46 5.83 8.47 -1.966 0.2722
## NE_control - E_exposed -8.62 5.82 8.47 -1.482 0.4875
## NE_exposed - E_control -13.09 5.86 8.53 -2.234 0.1893
## NE_exposed - E_exposed -10.25 5.85 8.53 -1.754 0.3559
## E_control - E_exposed 2.84 2.50 92.53 1.135 0.6692
##
## Degrees-of-freedom method: kenward-roger
## P value adjustment: tukey method for comparing a family of 4 estimates
##
## Likelihood ratio test
##
## Model 1: res_Nbr_methCpG_Nbr_coveredCpG_HYPER ~ Treatment + (1 | brotherPairID/clutch.ID) +
## (1 | Sex)
## Model 2: res_Nbr_methCpG_Nbr_coveredCpG_HYPER ~ 1 + (1 | brotherPairID/clutch.ID) +
## (1 | Sex)
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 8 -419.75
## 2 5 -422.58 -3 5.6616 0.1293
## $`emmeans of Treatment`
## Treatment emmean SE df lower.CL upper.CL
## NE_control 3.97 4.18 16.1 -4.88 12.82
## NE_exposed 5.61 4.20 16.2 -3.29 14.51
## E_control -7.52 4.19 16.0 -16.40 1.35
## E_exposed -4.53 4.18 16.1 -13.39 4.33
##
## Degrees-of-freedom method: kenward-roger
## Confidence level used: 0.95
##
## $`pairwise differences of Treatment`
## 1 estimate SE df t.ratio p.value
## NE_control - NE_exposed -1.64 2.51 93.04 -0.651 0.9148
## NE_control - E_control 11.50 5.92 8.37 1.942 0.2815
## NE_control - E_exposed 8.50 5.91 8.37 1.438 0.5112
## NE_exposed - E_control 13.13 5.95 8.43 2.208 0.1971
## NE_exposed - E_exposed 10.14 5.94 8.43 1.707 0.3771
## E_control - E_exposed -3.00 2.47 92.51 -1.216 0.6185
##
## Degrees-of-freedom method: kenward-roger
## P value adjustment: tukey method for comparing a family of 4 estimates
## NOT significant
annotate_figure(ggarrange(listplots[[1]], listplots[[2]],ncol = 2, nrow = 1),
top = text_grob("Parental DMS hypomethylated upon infection, in offspring"))
listplots <- myfun(X = PM_G2[PM_G2$hypohyper %in% "hyper",])
## Likelihood ratio test
##
## Model 1: res_Nbr_methCpG_Nbr_coveredCpG_HYPO ~ Treatment + (1 | brotherPairID/clutch.ID) +
## (1 | Sex)
## Model 2: res_Nbr_methCpG_Nbr_coveredCpG_HYPO ~ 1 + (1 | brotherPairID/clutch.ID) +
## (1 | Sex)
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 8 -476.63
## 2 5 -482.69 -3 12.123 0.006974 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## $`emmeans of Treatment`
## Treatment emmean SE df lower.CL upper.CL
## NE_control 13.94 5.13 17.0 3.12 24.75
## NE_exposed 8.48 5.19 17.2 -2.46 19.42
## E_control -9.77 5.15 16.8 -20.66 1.11
## E_exposed -12.95 5.14 16.9 -23.79 -2.11
##
## Degrees-of-freedom method: kenward-roger
## Confidence level used: 0.95
##
## $`pairwise differences of Treatment`
## 1 estimate SE df t.ratio p.value
## NE_control - NE_exposed 5.46 4.45 93.8 1.225 0.6124
## NE_control - E_control 23.71 7.28 10.3 3.257 0.0353
## NE_control - E_exposed 26.88 7.26 10.3 3.701 0.0172
## NE_exposed - E_control 18.25 7.36 10.5 2.481 0.1209
## NE_exposed - E_exposed 21.43 7.33 10.5 2.923 0.0598
## E_control - E_exposed 3.17 4.37 93.0 0.726 0.8864
##
## Degrees-of-freedom method: kenward-roger
## P value adjustment: tukey method for comparing a family of 4 estimates
##
## Likelihood ratio test
##
## Model 1: res_Nbr_methCpG_Nbr_coveredCpG_HYPER ~ Treatment + (1 | brotherPairID/clutch.ID) +
## (1 | Sex)
## Model 2: res_Nbr_methCpG_Nbr_coveredCpG_HYPER ~ 1 + (1 | brotherPairID/clutch.ID) +
## (1 | Sex)
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 8 -477.24
## 2 5 -483.28 -3 12.081 0.00711 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## $`emmeans of Treatment`
## Treatment emmean SE df lower.CL upper.CL
## NE_control -14.11 5.18 17.0 -25.04 -3.18
## NE_exposed -8.53 5.24 17.2 -19.58 2.52
## E_control 9.95 5.21 16.8 -1.05 20.95
## E_exposed 12.96 5.19 16.9 2.00 23.92
##
## Degrees-of-freedom method: kenward-roger
## Confidence level used: 0.95
##
## $`pairwise differences of Treatment`
## 1 estimate SE df t.ratio p.value
## NE_control - NE_exposed -5.58 4.47 93.8 -1.247 0.5989
## NE_control - E_control -24.06 7.36 10.3 -3.269 0.0348
## NE_control - E_exposed -27.07 7.34 10.3 -3.687 0.0177
## NE_exposed - E_control -18.48 7.43 10.4 -2.486 0.1204
## NE_exposed - E_exposed -21.49 7.41 10.4 -2.901 0.0622
## E_control - E_exposed -3.01 4.39 93.0 -0.686 0.9021
##
## Degrees-of-freedom method: kenward-roger
## P value adjustment: tukey method for comparing a family of 4 estimates
## Treatment SIGNIFICANT in both excess hypo/hyper methylation **
# $`pairwise differences of Treatment`
## HYPO
# 1 estimate SE df t.ratio p.value
# NE_control - E_control 23.71 7.28 10.3 3.257 0.0353
# NE_control - E_exposed 26.88 7.26 10.3 3.701 0.0172
## HYPER
# 1 estimate SE df t.ratio p.value
# NE_control - E_control -24.06 7.36 10.3 -3.269 0.0348
# NE_control - E_exposed -27.07 7.34 10.3 -3.687 0.0177
annotate_figure(ggarrange(listplots[[1]], listplots[[2]],ncol = 2, nrow = 1),
top = text_grob("Parental DMS hypermethylated upon infection, in offspring"))
-> The beta values in parentalDMS in offspring follow the parental pattern hypo/hyper methylated upon infection
intersect(paste(DMSlist$DMS_15pc_G1_C_T$chr, DMSlist$DMS_15pc_G1_C_T$start),
intersect(paste(DMSlist$DMS_15pc_CC_CT$chr, DMSlist$DMS_15pc_CC_CT$start),
paste(DMSlist$DMS_15pc_TC_TT$chr, DMSlist$DMS_15pc_TC_TT$start)))
## [1] "Gy_chrII 22196179" "Gy_chrXII 10863858" "Gy_chrXVII 2658079"
## [4] "Gy_chrXX 5344222"
## ONLY 4!!! "Gy_chrII 22196179" "Gy_chrXII 10863858" "Gy_chrXVII 2658079" "Gy_chrXX 5344222"
DMS1 = DMSlist$DMS_15pc_CC_CT # from: getDiffMeth(myuniteCov = uniteCov14_G2_woSexAndUnknowChr_G1CONTROL, myMetadata = fullMetadata_OFFS[fullMetadata_OFFS$trtG1G2_NUM %in% c(5,6),])
DMS2 = DMSlist$DMS_15pc_TC_TT # from: getDiffMeth(myuniteCov = uniteCov14_G2_woSexAndUnknowChr_G1INFECTED, myMetadata = fullMetadata_OFFS[fullMetadata_OFFS$trtG1G2_NUM %in% c(2,3),])
coreDMS <- intersect(paste(DMS1$chr, DMS1$start), paste(DMS2$chr, DMS2$start))
## prepare manhattan plot df
comp1 = "CC-CI"; comp2 = "IC-II"
A <- methylKit::select(DMS1, which(paste(DMS1$chr, DMS1$start) %in% coreDMS))
B <- as.data.frame(annotateWithGeneParts(as(A,"GRanges"),annotBed12)@members)
A2 <- methylKit::select(DMS2, which(paste(DMS2$chr, DMS2$start) %in% coreDMS))
B2 <- as.data.frame(annotateWithGeneParts(as(A2,"GRanges"),annotBed12)@members)
## Make Manhattan plot:
ggarrange(
makeManhattanPlots(DMSfile = DMS1,
annotFile = as.data.frame(annotateWithGeneParts(as(DMS1,"GRanges"),annotBed12)@members),
GYgynogff = GYgynogff, mycols = c("red", "grey", "black", "green"),
mytitle = paste0("Manhattan plot of ", comp1, " DMS")),
makeManhattanPlots(DMSfile = DMS2,
annotFile = as.data.frame(annotateWithGeneParts(as(DMS2,"GRanges"),annotBed12)@members),
GYgynogff = GYgynogff, mycols = c("red", "grey", "black", "green"),
mytitle = paste0("Manhattan plot of ", comp2, " DMS")),
makeManhattanPlots(DMSfile = A, annotFile = B, GYgynogff = GYgynogff,
mycols = c("red", "grey", "black", "green"), mytitle = paste0("Manhattan plot core DMS in ", comp1)),
makeManhattanPlots(DMSfile = A2, annotFile = B2, GYgynogff = GYgynogff,
mycols = c("red", "grey", "black", "green"), mytitle = paste0("Manhattan plot core DMS in ", comp2)),
labels = c("A", "B", "C", "D"), ncol = 1, nrow = 4, common.legend = T)
# Change the vector into a methobject:
df <- data.frame(chr=sapply(strsplit(coreDMS, " "), `[`, 1),
start=sapply(strsplit(coreDMS, " "), `[`, 2),
end=sapply(strsplit(coreDMS, " "), `[`, 2))
# get annotation
anotcore <- getAnnotationFun(makeGRangesFromDataFrame(df))
print(paste0("The number of genes associated with core DMS is:"))
## [1] "The number of genes associated with core DMS is:"
print(nrow(anotcore)) # number of genes
## [1] 21
# Reorder chromosome
anotcore <- anotcore %>%
mutate(chr = gsub("Gy_chr", "", seqid), chrom_nr = seqid %>% deroman(), chrom_order=factor(chrom_nr) %>%
as.numeric()) %>% arrange(chrom_order) %>%
mutate(geneLengthkb = (end - start)/1000, nCpGperGenekb = nCpG/geneLengthkb)
# Plot
ggplot(anotcore, aes(x=start, y = nCpGperGenekb)) + geom_point(alpha=.4) +
facet_grid(.~fct_inorder(chr)) +
geom_hline(yintercept = 1)+
theme(axis.text.x=element_blank()) +
xlab("Genes genes associated with core DMS") +
ylab("Number of differentially methylated CpG per gene kb")
## Genes with at least 1 CpG differentially methylated per kb: NONE
anotcore[anotcore$nCpGperGenekb >= 1,]
## [1] seqid source type start
## [5] end score strand phase
## [9] ID Name Alias Note
## [13] Dbxref Ontology_term Parent X_AED
## [17] X_QI X_eAED X_merge_warning aligned_coverage
## [21] aligned_identity score.1 target_length Target
## [25] nCpG chr chrom_nr chrom_order
## [29] geneLengthkb nCpGperGenekb
## <0 rows> (or 0-length row.names)
## For all genes:
geneNotes = anotcore$Note
# extract uniprot symbol from note, then uppercase
genes = sapply(geneNotes, function(x) sub("Similar to ", "", x) %>% str_extract(".*:") %>% str_remove(":") %>% toupper)
# Convert the uniprot gene names to entrez ids
topGenesDF = unlist(mapIds(org.Hs.eg.db, keys = genes, column = "ENTREZID", keytype = "SYMBOL")) %>% data.frame()
names(topGenesDF) = "ENTREZID" ; topGenesDF$GeneSymbol = rownames(topGenesDF) ; rownames(topGenesDF) = NULL
# Retrieve gene summary & descirption
genes = entrez_summary(db="gene", id=topGenesDF$ENTREZID)
SummaDF = lapply(genes, function(x) x[["summary"]]) %>% unlist() %>% data.frame()
names(SummaDF) = "Summary" ; SummaDF$ENTREZID = rownames(SummaDF) ; rownames(SummaDF) = NULL
DescDF = lapply(genes, function(x) x[["description"]]) %>% unlist() %>% data.frame()
names(DescDF) = "description" ; DescDF$ENTREZID = rownames(DescDF) ; rownames(DescDF) = NULL
# Output complete table
topGenesDF = merge(merge(DescDF, topGenesDF, all = T), SummaDF, all = T)
| ENTREZID | description | GeneSymbol | Summary |
|---|---|---|---|
| 100287898 | tetratricopeptide repeat domain 34 | TTC34 | |
| 10463 | solute carrier family 30 member 9 | SLC30A9 | |
| 10550 | ADP ribosylation factor like GTPase 6 interacting protein 5 | ARL6IP5 | Expression of this gene is affected by vitamin A. The encoded protein of this gene may be associated with the cytoskeleton. A similar protein in rats may play a role in the regulation of cell differentiation. The rat protein binds and inhibits the cell membrane glutamate transporter EAAC1. The expression of the rat gene is upregulated by retinoic acid, which results in a specific reduction in EAAC1-mediated glutamate transport. [provided by RefSeq, Jul 2008] |
| 10575 | chaperonin containing TCP1 subunit 4 | CCT4 | The chaperonin containing TCP1 (MIM 186980) complex (CCT), also called the TCP1 ring complex, consists of 2 back-to-back rings, each containing 8 unique but homologous subunits, such as CCT4. CCT assists the folding of newly translated polypeptide substrates through multiple rounds of ATP-driven release and rebinding of partially folded intermediate forms. Substrates of CCT include the cytoskeletal proteins actin (see MIM 102560) and tubulin (see MIM 191130), as well as alpha-transducin (MIM 139330) (Won et al., 1998 [PubMed 9819444]).[supplied by OMIM, Mar 2008] |
| 152573 | shisa family member 3 | SHISA3 | This gene encodes a single-transmembrane protein which is one of nine members of a family of transmembrane adaptors that modulate both WNT and FGF signaling by blocking the maturation and transport of their receptors to the cell surface. Members of this family contain an N-terminal cysteine-rich domain with a distinct cysteine pattern, a single transmembrane domain, and a C-terminal proline-rich, low complexity region. The encoded protein acts as a tumor suppressor by accelerating beta-catenin degradation. [provided by RefSeq, Jul 2017] |
| 200959 | gamma-aminobutyric acid type A receptor subunit rho3 | GABRR3 | The neurotransmitter gamma-aminobutyric acid (GABA) functions in the central nervous system to regulate synaptic transmission of neurons. This gene encodes one of three related subunits, which combine as homo- or hetero-pentamers to form GABA(C) receptors. In humans, some individuals contain a single-base polymorphism (dbSNP rs832032) that is predicted to inactivate the gene product. [provided by RefSeq, Jan 2012] |
| 2309 | forkhead box O3 | FOXO3 | This gene belongs to the forkhead family of transcription factors which are characterized by a distinct forkhead domain. This gene likely functions as a trigger for apoptosis through expression of genes necessary for cell death. Translocation of this gene with the MLL gene is associated with secondary acute leukemia. Alternatively spliced transcript variants encoding the same protein have been observed. [provided by RefSeq, Jul 2008] |
| 400745 | SH2 domain containing 5 | SH2D5 | |
| 4885 | neuronal pentraxin 2 | NPTX2 | This gene encodes a member of the family of neuronal petraxins, synaptic proteins that are related to C-reactive protein. This protein is involved in excitatory synapse formation. It also plays a role in clustering of alpha-amino-3-hydroxy-5-methyl-4-isoxazolepropionic acid (AMPA)-type glutamate receptors at established synapses, resulting in non-apoptotic cell death of dopaminergic nerve cells. Up-regulation of this gene in Parkinson disease (PD) tissues suggests that the protein may be involved in the pathology of PD. [provided by RefSeq, Feb 2009] |
| 51095 | tRNA nucleotidyl transferase 1 | TRNT1 | The protein encoded by this gene is a CCA-adding enzyme which belongs to the tRNA nucleotidyltransferase/poly(A) polymerase family. This essential enzyme functions by catalyzing the addition of the conserved nucleotide triplet CCA to the 3’ terminus of tRNA molecules. Mutations in this gene result in sideroblastic anemia with B-cell immunodeficiency, periodic fevers, and developmental delay. Alternative splicing results in multiple transcript variants. [provided by RefSeq, Dec 2014] |
| 5295 | phosphoinositide-3-kinase regulatory subunit 1 | PIK3R1 | Phosphatidylinositol 3-kinase phosphorylates the inositol ring of phosphatidylinositol at the 3-prime position. The enzyme comprises a 110 kD catalytic subunit and a regulatory subunit of either 85, 55, or 50 kD. This gene encodes the 85 kD regulatory subunit. Phosphatidylinositol 3-kinase plays an important role in the metabolic actions of insulin, and a mutation in this gene has been associated with insulin resistance. Alternative splicing of this gene results in four transcript variants encoding different isoforms. [provided by RefSeq, Jun 2011] |
| 5888 | RAD51 recombinase | RAD51 | The protein encoded by this gene is a member of the RAD51 protein family. RAD51 family members are highly similar to bacterial RecA and Saccharomyces cerevisiae Rad51, and are known to be involved in the homologous recombination and repair of DNA. This protein can interact with the ssDNA-binding protein RPA and RAD52, and it is thought to play roles in homologous pairing and strand transfer of DNA. This protein is also found to interact with BRCA1 and BRCA2, which may be important for the cellular response to DNA damage. BRCA2 is shown to regulate both the intracellular localization and DNA-binding ability of this protein. Loss of these controls following BRCA2 inactivation may be a key event leading to genomic instability and tumorigenesis. Multiple transcript variants encoding different isoforms have been found for this gene. [provided by RefSeq, Aug 2009] |
| 6456 | SH3 domain containing GRB2 like 2, endophilin A1 | SH3GL2 | |
| 7106 | tetraspanin 4 | TSPAN4 | The protein encoded by this gene is a member of the transmembrane 4 superfamily, also known as the tetraspanin family. Most of these members are cell-surface proteins that are characterized by the presence of four hydrophobic domains. The proteins mediate signal transduction events that play a role in the regulation of cell development, activation, growth and motility. This encoded protein is a cell surface glycoprotein and is similar in sequence to its family member CD53 antigen. It is known to complex with integrins and other transmembrane 4 superfamily proteins. Alternatively spliced transcript variants encoding different isoforms have been identified. [provided by RefSeq, Jul 2008] |
| 8501 | solute carrier family 43 member 1 | SLC43A1 | SLC43A1 belongs to the system L family of plasma membrane carrier proteins that transports large neutral amino acids (Babu et al., 2003 [PubMed 12930836]).[supplied by OMIM, Mar 2008] |
| 85460 | zinc finger protein 518B | ZNF518B | |
| 8601 | regulator of G protein signaling 20 | RGS20 | The protein encoded by this gene belongs to the family of regulator of G protein signaling (RGS) proteins, which are regulatory and structural components of G protein-coupled receptor complexes. RGS proteins inhibit signal transduction by increasing the GTPase activity of G protein alpha subunits, thereby driving them into their inactive GDP-bound forms. This protein selectively binds to G(z)-alpha and G(alpha)-i2 subunits, and regulates their signaling activities. Alternatively spliced transcript variants encoding different isoforms have been found for this gene. [provided by RefSeq, Sep 2011] |
| 91653 | BOC cell adhesion associated, oncogene regulated | BOC | The protein encoded by this gene is a member of the immunoglobulin/fibronectin type III repeat family. It is a component of a cell-surface receptor complex that mediates cell-cell interactions between muscle precursor cells, and promotes myogenic differentiation. Alternative splicing results in multiple transcript variants encoding different isoforms. [provided by RefSeq, Sep 2014] |
| 9663 | lipin 2 | LPIN2 | Mouse studies suggest that this gene functions during normal adipose tissue development and may play a role in human triglyceride metabolism. This gene represents a candidate gene for human lipodystrophy, characterized by loss of body fat, fatty liver, hypertriglyceridemia, and insulin resistance. [provided by RefSeq, Jul 2008] |
| NA | NA | MTNR1BB | NA |
DMS1 = DMSlist$DMS_15pc_CC_TC
DMS2 = DMSlist$DMS_15pc_CT_TT
transmDMS <- intersect(paste(DMS1$chr, DMS1$start), paste(DMS2$chr, DMS2$start))
length(transmDMS)
## [1] 257
## prepare manhattan plot df
comp1 = "CC_TC"; comp2 = "CT_TT"
A <- methylKit::select(DMS1, which(paste(DMS1$chr, DMS1$start) %in% transmDMS))
B <- as.data.frame(annotateWithGeneParts(as(A,"GRanges"),annotBed12)@members)
A2 <- methylKit::select(DMS2, which(paste(DMS2$chr, DMS2$start) %in% transmDMS))
B2 <- as.data.frame(annotateWithGeneParts(as(A2,"GRanges"),annotBed12)@members)
## Make Manhattan plot:
ggarrange(
makeManhattanPlots(DMSfile = DMS1,
annotFile = as.data.frame(annotateWithGeneParts(as(DMS1,"GRanges"),annotBed12)@members),
GYgynogff = GYgynogff, mycols = c("red", "grey", "black", "green"),
mytitle = paste0("Manhattan plot of ", comp1, " DMS")),
makeManhattanPlots(DMSfile = DMS2,
annotFile = as.data.frame(annotateWithGeneParts(as(DMS2,"GRanges"),annotBed12)@members),
GYgynogff = GYgynogff, mycols = c("red", "grey", "black", "green"),
mytitle = paste0("Manhattan plot of ", comp2, " DMS")),
makeManhattanPlots(DMSfile = A, annotFile = B, GYgynogff = GYgynogff,
mycols = c("red", "grey", "black", "green"), mytitle = paste0("Manhattan plot transmitted DMS in ", comp1)),
makeManhattanPlots(DMSfile = A2, annotFile = B2, GYgynogff = GYgynogff,
mycols = c("red", "grey", "black", "green"), mytitle = paste0("Manhattan plot transmitted DMS in ", comp2)),
labels = c("A", "B", "C", "D"), ncol = 1, nrow = 4, common.legend = T)
meanBeta_G2_simple <- PM_G2 %>% group_by(Chr, Pos, Treatment) %>%
dplyr::summarize(Mean = mean(BetaValue, na.rm=TRUE))
names(meanBeta_G2_simple) <- c("Chr", "Pos", "Treatment_G2", "MeanBetaG2")
genome <- GYgynogff %>%
mutate(chrom_nr=chrom %>% deroman(), chrom_order=factor(chrom_nr) %>% as.numeric()) %>%
arrange(chrom_order) %>%
mutate(gstart=lag(length,default=0) %>% cumsum(), gend=gstart+length,
type=LETTERS[2-(chrom_order%%2)], gmid=(gstart+gend)/2)
mydata = tibble(trt=meanBeta_G2_simple$Treatment_G2,
chrom=meanBeta_G2_simple$Chr, pos=meanBeta_G2_simple$Pos, beta=meanBeta_G2_simple$MeanBetaG2)
mydata$pos <- as.numeric(mydata$pos)
mydata$pos <- as.numeric(mydata$pos)
table(mydata$chrom)## check that chrXIX and chrUN are well removed!!
##
## Gy_chrI Gy_chrII Gy_chrIII Gy_chrIV Gy_chrIX Gy_chrV
## 756 600 584 1056 756 580
## Gy_chrVI Gy_chrVII Gy_chrVIII Gy_chrX Gy_chrXI Gy_chrXII
## 656 940 644 568 908 800
## Gy_chrXIII Gy_chrXIV Gy_chrXV Gy_chrXVI Gy_chrXVII Gy_chrXVIII
## 788 904 500 768 784 584
## Gy_chrXX Gy_chrXXI
## 888 528
# join DMS and genomic position
data = dplyr::left_join(mydata, genome) %>% dplyr::mutate(gpos=pos+gstart)
# plot:
ggplot()+
geom_rect(data=genome,aes(xmin=gstart,xmax=gend,ymin=-Inf,ymax=Inf,fill=type), alpha=.2)+
geom_point(data=data, aes(x=gpos,y=beta, shape=trt, col=trt),fill="white", size = 1)+
scale_color_manual(values = colOffs)+
scale_shape_manual(values=c(21,21,21,21))+
scale_fill_manual(values=c(A=rgb(.9,.9,.9),B=NA),guide="none")+
scale_x_continuous(breaks=genome$gmid,labels=genome$chrom %>% str_remove(.,"Gy_chr"),
position = "top",expand = c(0,0))+
theme_minimal()+
theme(panel.grid = element_blank(),
axis.line=element_blank(),
axis.title = element_blank(),
strip.placement = "outside")+
facet_grid(trt~.)+
ggtitle("Mean methylation proportions at the 3648 parental DMS for each offspring group")
meanBeta_G2_extended <- PM_G2 %>% group_by(Chr, Pos, Treatment, Sex, brotherPairID) %>%
dplyr::summarize(Mean = mean(BetaValue, na.rm=TRUE))
names(meanBeta_G2_extended) <- c("Chr", "Pos", "Treatment_G2","Sex", "brotherPairID", "MeanBetaG2")
mydata = tibble(trt=meanBeta_G2_extended$Treatment_G2, sex = meanBeta_G2_extended$Sex, brotherPairID = meanBeta_G2_extended$brotherPairID,
chrom=meanBeta_G2_extended$Chr, pos=meanBeta_G2_extended$Pos, beta=meanBeta_G2_extended$MeanBetaG2)
mydata$pos <- as.numeric(mydata$pos)
# join DMS and genomic position
data = dplyr::left_join(mydata, genome) %>% dplyr::mutate(gpos=pos+gstart)
## Add distance to center
data$dist2mid <- abs(data$gmid - data$gpos)
mod <- lmer(beta ~ dist2mid:chrom + (1|trt) + (1|sex) + (1|brotherPairID), data = data, REML = F)
mod0 <- lmer(beta ~ dist2mid + (1|trt) + (1|sex) + (1|brotherPairID), data = data, REML = F)
mod00 <- lmer(beta ~ chrom + (1|trt) + (1|sex) + (1|brotherPairID), data = data, REML = F)
lrtest(mod, mod0) # chromosome matters
## Likelihood ratio test
##
## Model 1: beta ~ dist2mid:chrom + (1 | trt) + (1 | sex) + (1 | brotherPairID)
## Model 2: beta ~ dist2mid + (1 | trt) + (1 | sex) + (1 | brotherPairID)
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 25 -884444
## 2 6 -884775 -19 661.57 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
lrtest(mod0, mod00) # distance to middle matters
## Likelihood ratio test
##
## Model 1: beta ~ dist2mid + (1 | trt) + (1 | sex) + (1 | brotherPairID)
## Model 2: beta ~ chrom + (1 | trt) + (1 | sex) + (1 | brotherPairID)
## #Df LogLik Df Chisq Pr(>Chisq)
## 1 6 -884775
## 2 24 -884404 18 740.64 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## check normality of residuals assumption
qqnorm(resid(mod))
qqline(resid(mod)) # quite skewed
pred <- ggpredict(mod, terms = c("dist2mid","chrom"))
plot(pred) +
scale_color_manual(values = 1:20)